In [229]:
import numpy as np
import math
import pandas as pd
pd.set_option('display.max_columns', 26)
from sklearn.linear_model import OrthogonalMatchingPursuit as OMP
from sklearn.preprocessing import Normalizer,tests
from sklearn.cross_validation import StratifiedKFold

columnNames = ["symboling","normalized_losses","make","fuel_type","aspiration","num_of_doors","body_style",
               "drive_wheels","engine_location","wheel_base","length","width","height","curb_weight",
               "engine_type","num_of_cylinders","engine_size","fuel_system","bore","stroke",
               "compression_ratio","horsepower","peak_rpm","city_mpg","highway_mpg","price"]

carsData = pd.read_table("./imports-85.data", names=columnNames, delimiter=",", index_col=False)
carsData = carsData.convert_objects(convert_numeric=True)
# print carsData.head(n=6)
print carsData.describe()

        symboling  normalized_losses  wheel_base      length       width  \
count  205.000000         164.000000  205.000000  205.000000  205.000000   
mean     0.834146         122.000000   98.756585  174.049268   65.907805   
std      1.245307          35.442168    6.021776   12.337289    2.145204   
min     -2.000000          65.000000   86.600000  141.100000   60.300000   
25%      0.000000          94.000000   94.500000  166.300000   64.100000   
50%      1.000000         115.000000   97.000000  173.200000   65.500000   
75%      2.000000         150.000000  102.400000  183.100000   66.900000   
max      3.000000         256.000000  120.900000  208.100000   72.300000   

           height  curb_weight  engine_size        bore      stroke  \
count  205.000000   205.000000   205.000000  201.000000  201.000000   
mean    53.724878  2555.565854   126.907317    3.329751    3.255423   
std      2.443522   520.680204    41.642693    0.273539    0.316717   
min     47.800000  1488.000000 



In [125]:
print carsData.select_dtypes(exclude=["float64","int","float"]).head()

          make fuel_type aspiration num_of_doors   body_style drive_wheels  \
0  alfa-romero       gas        std          two  convertible          rwd   
1  alfa-romero       gas        std          two  convertible          rwd   
2  alfa-romero       gas        std          two    hatchback          rwd   
3         audi       gas        std         four        sedan          fwd   
4         audi       gas        std         four        sedan          4wd   

  engine_location engine_type num_of_cylinders fuel_system  
0           front        dohc             four        mpfi  
1           front        dohc             four        mpfi  
2           front        ohcv              six        mpfi  
3           front         ohc             four        mpfi  
4           front         ohc             five        mpfi  


In [257]:
'''
function to fill missing values of column by taking dependence on another column to get the most 
appropriate values

Note: we can use nearest neighbours approach to get a better approximation but I will using the below method
        for this challenge
'''
def fill_missing(missingColVlaue, dependentColValue, missingColName, dependentColName):
    
    if not math.isnan(carsData.groupby([dependentColName]).mean()[missingColName][dependentColValue]) \
            and math.isnan(missingColVlaue):
        missingColVlaue = carsData.groupby([dependentColName]).mean()[missingColName][dependentColValue]
    elif math.isnan(carsData.groupby([dependentColName]).mean()[missingColName][dependentColValue]) \
            and math.isnan(missingColVlaue):
        missingColVlaue = carsData[missingColName].mean()
    return missingColVlaue


In [258]:
# count and mean using group by
pd.DataFrame({'mean_norm_loss': carsData.groupby('make').mean().normalized_losses,
             'count': carsData.groupby('make').size()}).reset_index()

Unnamed: 0,make,count,mean_norm_loss
0,alfa-romero,3,122.0
1,audi,7,161.0
2,bmw,8,190.0
3,chevrolet,3,100.0
4,dodge,9,133.444444
5,honda,13,103.0
6,isuzu,4,122.0
7,jaguar,3,145.0
8,mazda,17,123.933333
9,mercedes-benz,8,102.8


In [259]:
# filling missing normalized loss with similar make values
'''
from the above table it is clear that only a very 4 makes that are not having any values in the normalized 
losses column. From my observation cars of similar makes have almost the same normalized losses, so by using 
making use of this observation I am filling the missing values for the normalized losses in the data.

Note: I agree this is not the right thing to do, but for the purpose of this challenge I am proceeding this
      way.
'''

carsData["normalized_losses"] = carsData.apply(lambda x: 
                                    fill_missing(x['normalized_losses'], x['make'],
                                                 "normalized_losses", "make"), axis=1)


In [260]:
print carsData.select_dtypes(include=["float64","int"]).describe()

        symboling  normalized_losses  wheel_base      length       width  \
count  205.000000         205.000000  205.000000  205.000000  205.000000   
mean     0.834146         126.186132   98.756585  174.049268   65.907805   
std      1.245307          35.063201    6.021776   12.337289    2.145204   
min     -2.000000          65.000000   86.600000  141.100000   60.300000   
25%      0.000000         101.000000   94.500000  166.300000   64.100000   
50%      1.000000         122.000000   97.000000  173.200000   65.500000   
75%      2.000000         150.000000  102.400000  183.100000   66.900000   
max      3.000000         256.000000  120.900000  208.100000   72.300000   

           height  curb_weight  engine_size        bore      stroke  \
count  205.000000   205.000000   205.000000  201.000000  201.000000   
mean    53.724878  2555.565854   126.907317    3.329751    3.255423   
std      2.443522   520.680204    41.642693    0.273539    0.316717   
min     47.800000  1488.000000 

In [214]:
def OMPLinearModel(X, y, n):
    """
    X: predictors
    y: prediction
    n: number of non zero coefficients
    """
    lm = OMP(n_nonzero_coefs=n)
    lm.fit(X, y)
    yhat = lm.predict(X)
    print "squared error :", np.square(y - yhat)
    print "R2 value      :", lm.score(X, y)
    

data = carsData.select_dtypes(include=["float64","int"])

X = data.drop(["price"], axis=1)
y = data["price"]

OMPLinearModel(X, y, 4)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').