In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression


In [2]:
df = pd.read_csv('auto.csv')
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


In [3]:
unique_origins = df['origin'].unique()
unique_origins.sort()
unique_origins

array([1, 2, 3])

In [4]:
df['cylinders'].unique()

array([8, 4, 6, 3, 5])

In [5]:
dummy_cylinders = pd.get_dummies(df['cylinders'],prefix='cyl')
dummy_cylinders.head()

Unnamed: 0,cyl_3,cyl_4,cyl_5,cyl_6,cyl_8
0,0,0,0,0,1
1,0,0,0,0,1
2,0,0,0,0,1
3,0,0,0,0,1
4,0,0,0,0,1


In [6]:
df = pd.concat([df,dummy_cylinders],axis=1)
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,cyl_3,cyl_4,cyl_5,cyl_6,cyl_8
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,0,0,0,0,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,0,0,0,0,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,0,0,0,0,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,0,0,0,0,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,0,0,0,0,1


In [7]:
dummy_year = pd.get_dummies(df['year'],prefix='year')
df = pd.concat([df,dummy_year],axis=1)
df = df.drop(['year','cylinders'],axis=1)
df.head()

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,origin,cyl_3,cyl_4,cyl_5,cyl_6,...,year_73,year_74,year_75,year_76,year_77,year_78,year_79,year_80,year_81,year_82
0,18.0,307.0,130.0,3504.0,12.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,15.0,350.0,165.0,3693.0,11.5,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,18.0,318.0,150.0,3436.0,11.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,16.0,304.0,150.0,3433.0,12.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,17.0,302.0,140.0,3449.0,10.5,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
shuffled_rows = np.random.permutation(df.index)
shuffled_df = df.iloc[shuffled_rows]
shuffled_df.head()

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,origin,cyl_3,cyl_4,cyl_5,cyl_6,...,year_73,year_74,year_75,year_76,year_77,year_78,year_79,year_80,year_81,year_82
77,21.0,120.0,87.0,2979.0,19.5,2,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,14.0,454.0,220.0,4354.0,9.0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
207,19.0,120.0,88.0,3270.0,21.9,2,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
4,17.0,302.0,140.0,3449.0,10.5,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
147,24.0,120.0,97.0,2489.0,15.0,3,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0


In [9]:
highest_train_row = int(df.shape[0]*0.7)
train = shuffled_df.iloc[0:highest_train_row]
test = shuffled_df.iloc[highest_train_row:]


In [10]:
models = {}
features = [c for c in train.columns if c.startswith('cyl') or c.startswith('year')]
features

['cyl_3',
 'cyl_4',
 'cyl_5',
 'cyl_6',
 'cyl_8',
 'year_70',
 'year_71',
 'year_72',
 'year_73',
 'year_74',
 'year_75',
 'year_76',
 'year_77',
 'year_78',
 'year_79',
 'year_80',
 'year_81',
 'year_82']

In [11]:
for origin in unique_origins:
    model = LogisticRegression()
    X_train = train[features]
    y_train = train['origin'] == origin
    model.fit(X_train,y_train)
    models[origin] = model
    
models

{1: LogisticRegression(), 2: LogisticRegression(), 3: LogisticRegression()}

In [12]:
testing_probs = pd.DataFrame(columns=unique_origins)
testing_probs

Unnamed: 0,1,2,3


In [13]:
for origin in unique_origins:
    X_test = test[features]
    testing_probs[origin] = models[origin].predict_proba(X_test)[:,1]
    
testing_probs[3]

0      0.024055
1      0.048951
2      0.301825
3      0.209132
4      0.029300
         ...   
113    0.423987
114    0.014853
115    0.346159
116    0.063520
117    0.400041
Name: 3, Length: 118, dtype: float64

In [14]:
predicted_origins = testing_probs.idxmax(axis=1)
print(predicted_origins)

0      1
1      1
2      1
3      1
4      1
      ..
113    3
114    1
115    1
116    1
117    1
Length: 118, dtype: int64
