In [28]:
import pandas as pd

cars = pd.read_table("auto-mpg.data", delim_whitespace=True, header=None)
cars.columns = ["mpg", "cylinders", "displacement", "horsepower", "weight", "acceleration", "year", "origin", "car name"]

In [29]:
unique_regions = cars.origin.unique()
print(unique_regions)

[1 3 2]


In [30]:
cars = pd.concat([cars, pd.get_dummies(cars["cylinders"], prefix="cyl")], axis=1)
cars.head()

cars = pd.concat([cars, pd.get_dummies(cars["year"], prefix="year")], axis=1)
cars.drop(["year", "cylinders"], axis=1, inplace=True)
cars.head()

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,origin,car name,cyl_3,cyl_4,cyl_5,...,year_73,year_74,year_75,year_76,year_77,year_78,year_79,year_80,year_81,year_82
0,18.0,307.0,130.0,3504.0,12.0,1,chevrolet chevelle malibu,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,15.0,350.0,165.0,3693.0,11.5,1,buick skylark 320,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,18.0,318.0,150.0,3436.0,11.0,1,plymouth satellite,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,16.0,304.0,150.0,3433.0,12.0,1,amc rebel sst,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,17.0,302.0,140.0,3449.0,10.5,1,ford torino,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
import numpy as np

shuffled_rows = np.random.permutation(cars.index)
shuffled_cars = cars.iloc[shuffled_rows]

cutoff = int(shuffled_cars.shape[0] * 0.7)
train = shuffled_cars.iloc[:cutoff, :]
test = shuffled_cars.iloc[cutoff:, :]

In [32]:
from sklearn.linear_model import LogisticRegression

unique_origins = cars["origin"].unique()
unique_origins.sort()

models = {}
cars.iloc[:,7:].head()
cars.iloc[:,7:]
#print(unique_origins)
for i in unique_origins:
    lr = LogisticRegression()
    lr.fit(cars.iloc[:,7:], (cars["origin"] == i))
    models[i] = lr

In [46]:
testing_probs = pd.DataFrame(columns=unique_origins)

for i in unique_origins:
    predicted_probs = models[i].predict_proba(test.iloc[:,7:])
    #print(predicted_probs)
    testing_probs[i] = predicted_probs[:,1]
    
testing_probs.head(6)

Unnamed: 0,1,2,3
0,0.577322,0.118657,0.31561
1,0.317659,0.435008,0.245139
2,0.518455,0.285113,0.190713
3,0.276739,0.456875,0.264607
4,0.976868,0.01395,0.027892
5,0.550738,0.442028,0.07268


In [50]:
predicted_origins = testing_probs.idxmax(axis=1)
print(predicted_origins)

0      1
1      2
2      1
3      2
4      1
5      1
6      1
7      1
8      1
9      2
10     1
11     2
12     1
13     1
14     1
15     1
16     1
17     1
18     1
19     3
20     1
21     1
22     1
23     1
24     1
25     1
26     3
27     1
28     1
29     1
      ..
90     1
91     1
92     3
93     1
94     3
95     1
96     2
97     1
98     1
99     1
100    3
101    1
102    1
103    1
104    1
105    3
106    1
107    1
108    1
109    1
110    1
111    2
112    2
113    2
114    1
115    2
116    1
117    2
118    1
119    1
dtype: int64
