In [648]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.feature_selection import SelectKBest, RFE
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import FunctionTransformer,Binarizer, Imputer, LabelBinarizer, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline, make_union, FeatureUnion
from sklearn import metrics
%matplotlib inline

In [29]:
df = pd.read_csv('university_train.csv')
df.head()

Unnamed: 0,INSTNM,CITY,STABBR,ZIP,ACCREDAGENCY,MAIN,NUMBRANCH,PREDDEG,HIGHDEG,CONTROL,...,RELAFFIL,DISTANCEONLY,UGDS,AGE_ENTRY,FEMALE,MARRIED,DEPENDENT,MD_FAMINC,percent_on_student_loan,id_number
0,Panola College,Carthage,TX,75633-2397,Southern Association of Colleges and Schools C...,1,1,1,2,1,...,-2.0,0.0,2056.0,25.927665,0.706853,0.176396,0.479061,19588.54338,26.05,3756
1,Palo Alto University,Palo Alto,CA,94304-1337,Western Association of Schools and Colleges Se...,1,1,3,4,2,...,-2.0,0.0,194.0,29.712871,0.732673,0.188119,0.306931,18157.66968,68.23,449
2,Quincy University,Quincy,IL,62301-2699,North Central Association of Colleges and Scho...,1,1,3,4,2,...,30.0,0.0,1075.0,21.532362,0.543689,0.061489,0.826861,50437.83939,70.23,1121
3,Fortis College-Orange Park,Orange Park,FL,32065,Accrediting Council for Independent Colleges a...,1,2,1,2,3,...,-2.0,0.0,312.0,29.542683,0.847561,0.288618,0.256098,15967.10174,69.55,5343
4,Southwestern Assemblies of God University,Waxahachie,TX,75165,Southern Association of Colleges and Schools C...,1,1,3,4,2,...,27.0,0.0,1656.0,22.794651,0.475645,0.120344,0.763133,41197.69899,74.29,3782


In [6]:
df.dtypes

INSTNM                      object
CITY                        object
STABBR                      object
ZIP                         object
ACCREDAGENCY                object
MAIN                         int64
NUMBRANCH                    int64
PREDDEG                      int64
HIGHDEG                      int64
CONTROL                      int64
LOCALE                     float64
CCUGPROF                   float64
CCSIZSET                   float64
HBCU                       float64
PBI                        float64
MENONLY                    float64
WOMENONLY                  float64
RELAFFIL                   float64
DISTANCEONLY               float64
UGDS                       float64
AGE_ENTRY                  float64
FEMALE                     float64
MARRIED                    float64
DEPENDENT                  float64
MD_FAMINC                  float64
percent_on_student_loan    float64
id_number                    int64
dtype: object

In [7]:
df.shape

(3211, 27)

In [8]:
df.isnull().sum()

INSTNM                     0
CITY                       0
STABBR                     0
ZIP                        0
ACCREDAGENCY               0
MAIN                       0
NUMBRANCH                  0
PREDDEG                    0
HIGHDEG                    0
CONTROL                    0
LOCALE                     0
CCUGPROF                   0
CCSIZSET                   0
HBCU                       0
PBI                        0
MENONLY                    0
WOMENONLY                  0
RELAFFIL                   0
DISTANCEONLY               0
UGDS                       0
AGE_ENTRY                  0
FEMALE                     0
MARRIED                    0
DEPENDENT                  0
MD_FAMINC                  0
percent_on_student_loan    0
id_number                  0
dtype: int64

In [9]:
#Goal is to predict percent_on_student_loans
#Need to create some dummy variables

In [10]:
df['PREDDEG'].value_counts()

1    1336
3    1075
2     793
0       7
Name: PREDDEG, dtype: int64

In [30]:
df = pd.concat([df, pd.get_dummies(df['PREDDEG'], drop_first=True, prefix = 'PREDDEG')], axis = 1)

In [11]:
df['HIGHDEG'].value_counts()

4    949
2    948
1    930
3    372
0     12
Name: HIGHDEG, dtype: int64

In [32]:
df = pd.concat([df,pd.get_dummies(df['HIGHDEG'], drop_first=True, prefix = 'HIGHDEG')], axis = 1)

In [34]:
df['CONTROL'].value_counts()

3    1303
1    1233
2     675
Name: CONTROL, dtype: int64

In [35]:
df = pd.concat([df,pd.get_dummies(df['CONTROL'], drop_first=True, prefix = 'CONTROL')], axis = 1)

In [37]:
df['LOCALE'].value_counts()

 21.0    805
 11.0    723
 13.0    448
 12.0    384
 41.0    203
 32.0    187
 33.0    165
 22.0     81
 31.0     66
 23.0     60
 42.0     55
 43.0     32
-3.0       2
Name: LOCALE, dtype: int64

In [40]:
df = df.loc[df['LOCALE'] != -3.0]
#-3 is not documented as any locale type, so we drop it

In [42]:
df = pd.concat([df,pd.get_dummies(df['LOCALE'], drop_first=True,prefix = 'LOCALE')], axis = 1)

In [64]:
df.head()

Unnamed: 0,INSTNM,CITY,STABBR,ZIP,ACCREDAGENCY,MAIN,NUMBRANCH,PREDDEG,HIGHDEG,CONTROL,...,LOCALE_13.0,LOCALE_21.0,LOCALE_22.0,LOCALE_23.0,LOCALE_31.0,LOCALE_32.0,LOCALE_33.0,LOCALE_41.0,LOCALE_42.0,LOCALE_43.0
0,Panola College,Carthage,TX,75633-2397,Southern Association of Colleges and Schools C...,1,1,1,2,1,...,0,0,0,0,0,1,0,0,0,0
1,Palo Alto University,Palo Alto,CA,94304-1337,Western Association of Schools and Colleges Se...,1,1,3,4,2,...,0,0,0,0,0,0,0,1,0,0
2,Quincy University,Quincy,IL,62301-2699,North Central Association of Colleges and Scho...,1,1,3,4,2,...,0,0,0,0,0,0,1,0,0,0
3,Fortis College-Orange Park,Orange Park,FL,32065,Accrediting Council for Independent Colleges a...,1,2,1,2,3,...,0,1,0,0,0,0,0,0,0,0
4,Southwestern Assemblies of God University,Waxahachie,TX,75165,Southern Association of Colleges and Schools C...,1,1,3,4,2,...,0,1,0,0,0,0,0,0,0,0


In [72]:
df.drop('LOCALE', axis = 1, inplace = True)

In [73]:
df.drop('CONTROL', axis = 1, inplace = True)

In [74]:
df.drop('HIGHDEG', axis = 1, inplace = True)

In [76]:
df.drop('PREDDEG', axis = 1, inplace = True)

In [95]:
df.drop('MENONLY', axis = 1, inplace = True)

In [48]:
df['CCUGPROF'].value_counts()
#questionable to include this, as 30% of the values is not -2.0 (not applicable)

-2.0     945
 1.0     336
 11.0    278
 5.0     250
 2.0     249
 13.0    211
 4.0     179
 7.0     169
 3.0     143
 10.0     90
 9.0      90
 14.0     90
 15.0     77
 12.0     58
 6.0      38
 8.0       4
 0.0       2
Name: CCUGPROF, dtype: int64

In [49]:
df['CCSIZSET'].value_counts()
#same as above

-2.0     945
 6.0     346
 2.0     270
 1.0     251
 3.0     225
 9.0     143
 12.0    132
 11.0    119
 10.0    118
 13.0    112
 4.0     110
 15.0     93
 16.0     90
 14.0     72
 7.0      57
 8.0      55
 5.0      51
 17.0     20
Name: CCSIZSET, dtype: int64

In [51]:
df['UGDS'].value_counts().head()

119.0    10
131.0    10
114.0     9
109.0     9
94.0      9
Name: UGDS, dtype: int64

In [53]:
df['AGE_ENTRY'].value_counts().head()

28.480271    26
31.763343    25
29.129327    21
29.893959    18
29.343011    17
Name: AGE_ENTRY, dtype: int64

In [56]:
df['FEMALE'].value_counts().head()

0.695178    26
0.697885    25
0.773162    21
0.609785    18
0.785069    17
Name: FEMALE, dtype: int64

In [59]:
df['MARRIED'].value_counts().head()

0.173287    26
0.294515    25
0.180720    21
0.197571    18
0.170113    17
Name: MARRIED, dtype: int64

In [61]:
df['DEPENDENT'].value_counts().head()

0.295522    26
0.087442    25
0.246662    21
0.284949    18
0.243902    17
Name: DEPENDENT, dtype: int64

In [79]:
df['MD_FAMINC'].value_counts().head()

0.00000        62
14678.55989    26
16789.83113    25
13449.66401    21
15149.50454    18
Name: MD_FAMINC, dtype: int64

In [92]:
df['RELAFFIL'].nunique()
#Religious affiliation

47

In [569]:
df['HBCU'].value_counts()

0.0    3157
1.0      52
Name: HBCU, dtype: int64

In [570]:
df['PBI'].value_counts()

0.0    3148
1.0      61
Name: PBI, dtype: int64

In [78]:
df.columns

Index([u'INSTNM', u'CITY', u'STABBR', u'ZIP', u'ACCREDAGENCY', u'MAIN',
       u'NUMBRANCH', u'CCUGPROF', u'CCSIZSET', u'HBCU', u'PBI', u'MENONLY',
       u'WOMENONLY', u'RELAFFIL', u'DISTANCEONLY', u'UGDS', u'AGE_ENTRY',
       u'FEMALE', u'MARRIED', u'DEPENDENT', u'MD_FAMINC',
       u'percent_on_student_loan', u'id_number', u'PREDDEG_1', u'PREDDEG_2',
       u'PREDDEG_3', u'HIGHDEG_1', u'HIGHDEG_2', u'HIGHDEG_3', u'HIGHDEG_4',
       u'CONTROL_2', u'CONTROL_3', u'LOCALE_12.0', u'LOCALE_13.0',
       u'LOCALE_21.0', u'LOCALE_22.0', u'LOCALE_23.0', u'LOCALE_31.0',
       u'LOCALE_32.0', u'LOCALE_33.0', u'LOCALE_41.0', u'LOCALE_42.0',
       u'LOCALE_43.0'],
      dtype='object')

In [96]:
features = ['MAIN','NUMBRANCH','HBCU','PBI','WOMENONLY','DISTANCEONLY','UGDS','AGE_ENTRY','FEMALE',
           'MARRIED','DEPENDENT','MD_FAMINC','PREDDEG_1','PREDDEG_2','PREDDEG_3','HIGHDEG_1','HIGHDEG_2','HIGHDEG_3',
            'HIGHDEG_4','CONTROL_2','CONTROL_3','LOCALE_12.0','LOCALE_13.0','LOCALE_21.0','LOCALE_22.0','LOCALE_23.0',
            'LOCALE_31.0','LOCALE_32.0','LOCALE_33.0','LOCALE_41.0','LOCALE_42.0','LOCALE_43.0']

In [97]:
X = df[features]
y = df['percent_on_student_loan'].values

SelectKBest
--

In [538]:
kbest = SelectKBest(k=32)

In [539]:
kbest_columns = kbest.fit_transform(X,y)

In [540]:
print kbest_columns.shape 
print kbest.scores_ 
print kbest.pvalues_ 

(3209, 32)
[ 1.20452391  1.14537717  0.98433551  0.67426532  0.83968381  5.36661529
  2.2426503   1.23308538  1.02370241  0.93903746  1.38396973  1.58238101
  1.2253044   1.32736705  1.74522891  0.93072866  1.2820728   1.31300388
  1.74743733  1.2848572   1.31617637  1.13175023  1.02101426  1.02576809
  0.93924093  1.13598842  0.97789625  0.84498721  0.93286152  0.75688296
  0.42473854  0.31127276]
[  8.99034129e-004   1.11660197e-002   6.10500037e-001   1.00000000e+000
   9.98833256e-001   4.41274207e-132   7.85863201e-038   2.25205371e-004
   3.48312687e-001   8.62615102e-001   3.52811460e-008   2.72435175e-014
   3.31684849e-004   1.21425927e-006   7.61940124e-020   8.93611260e-001
   1.67610405e-005   2.85160430e-006   6.37552240e-020   1.43471419e-005
   2.36542275e-006   1.85545120e-002   3.65022863e-001   3.35701357e-001
   8.61790075e-001   1.58925134e-002   6.52946395e-001   9.98323083e-001
   8.86154150e-001   9.99999490e-001   1.00000000e+000   1.00000000e+000]


In [541]:
for col_name, score in zip(X.columns, kbest.scores_):
    print col_name, ' : ', score
#to see column and its KBest Score

MAIN  :  1.20452391348
NUMBRANCH  :  1.14537716547
HBCU  :  0.984335510326
PBI  :  0.674265316
WOMENONLY  :  0.839683808861
DISTANCEONLY  :  5.36661528808
UGDS  :  2.24265030198
AGE_ENTRY  :  1.23308538112
FEMALE  :  1.02370241088
MARRIED  :  0.939037459574
DEPENDENT  :  1.38396973182
MD_FAMINC  :  1.58238100622
PREDDEG_1  :  1.22530439643
PREDDEG_2  :  1.32736705227
PREDDEG_3  :  1.74522890624
HIGHDEG_1  :  0.930728657623
HIGHDEG_2  :  1.28207280027
HIGHDEG_3  :  1.31300387599
HIGHDEG_4  :  1.74743732993
CONTROL_2  :  1.28485720039
CONTROL_3  :  1.31617637203
LOCALE_12.0  :  1.13175023496
LOCALE_13.0  :  1.02101425646
LOCALE_21.0  :  1.02576808682
LOCALE_22.0  :  0.939240934447
LOCALE_23.0  :  1.13598841753
LOCALE_31.0  :  0.977896252485
LOCALE_32.0  :  0.844987211139
LOCALE_33.0  :  0.932861518549
LOCALE_41.0  :  0.7568829623
LOCALE_42.0  :  0.424738540798
LOCALE_43.0  :  0.3112727574


In [542]:
mask = kbest.get_support() #list of booleans
new_features = [] # The list of your K best features

for bool, feature in zip(mask, features):
    if bool:
        new_features.append(feature)
#to help create a new dataframe with KBest columns

In [543]:
dfkbest = pd.DataFrame(kbest_columns, columns = new_features)
dfkbest.head()

Unnamed: 0,MAIN,NUMBRANCH,HBCU,PBI,WOMENONLY,DISTANCEONLY,UGDS,AGE_ENTRY,FEMALE,MARRIED,...,LOCALE_13.0,LOCALE_21.0,LOCALE_22.0,LOCALE_23.0,LOCALE_31.0,LOCALE_32.0,LOCALE_33.0,LOCALE_41.0,LOCALE_42.0,LOCALE_43.0
0,1.0,1.0,0.0,0.0,0.0,0.0,2056.0,25.927665,0.706853,0.176396,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.0,0.0,0.0,0.0,194.0,29.712871,0.732673,0.188119,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,1.0,0.0,0.0,0.0,0.0,1075.0,21.532362,0.543689,0.061489,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1.0,2.0,0.0,0.0,0.0,0.0,312.0,29.542683,0.847561,0.288618,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,0.0,0.0,0.0,0.0,1656.0,22.794651,0.475645,0.120344,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [544]:
lm = LinearRegression()

In [545]:
model = lm.fit(dfkbest,y)
model.coef_, model.intercept_

(array([ -5.82043015e+00,  -2.59865175e-01,   3.03920246e+01,
          7.56573573e+00,   4.83616607e+00,  -2.11350822e+01,
         -2.85743173e-04,  -1.54207784e-01,   1.22428712e+01,
         -7.47510312e+01,  -5.87765444e+01,   6.66092639e-04,
         -2.27961865e+01,  -1.86730505e+01,  -1.28460189e+01,
          1.24693158e+00,   3.25713296e+00,   1.28035021e+01,
          1.00554521e+01,   1.46840007e+01,   3.18718991e+01,
          2.68424977e+00,   4.45378914e+00,   2.86954854e+00,
          1.62984020e+00,   1.95867177e+00,   8.27909225e+00,
          6.36547479e+00,   6.97199044e+00,   4.18034074e+00,
         -2.01090556e+00,  -7.86687016e+00]), 69.674333421099249)

In [546]:
predictions = model.predict(dfkbest)
scores = cross_val_score(model, dfkbest, y, cv=5)
scores

array([ 0.45306362,  0.4875242 ,  0.46988523,  0.41282821,  0.43871388])

In [547]:
print 'R^2:', model.score(dfkbest, y)
print 'RMSE:', np.sqrt(metrics.mean_squared_error(y, predictions))

R^2: 0.465118136625
RMSE: 20.4832683465


In [548]:
lasso = Lasso(alpha =0.01)

In [549]:
modellasso = lasso.fit(dfkbest,y)
print modellasso.score(dfkbest,y)
predictionlasso = modellasso.predict(dfkbest)
print np.sqrt(metrics.mean_squared_error(y, predictionlasso))

0.4645997972
20.4931908315


In [550]:
ridge = Ridge(alpha = 0.000000001)

In [551]:
modelridge = ridge.fit(dfkbest,y)
print modelridge.score(dfkbest,y)
predictionridge = modelridge.predict(dfkbest)
print np.sqrt(metrics.mean_squared_error(y, predictionridge))

0.465118136625
20.4832683465


RFE
--

In [552]:
rfe = RFE(lm,n_features_to_select=32)

In [553]:
rfe.fit(X,y)
print rfe.n_features_
print rfe.support_
print rfe.ranking_

32
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [554]:
mask1 = rfe.get_support() #list of booleans
new_features1 = [] # The list of your K best features

for bool, feature in zip(mask1, features):
    if bool:
        new_features1.append(feature)
new_features1

['MAIN',
 'NUMBRANCH',
 'HBCU',
 'PBI',
 'WOMENONLY',
 'DISTANCEONLY',
 'UGDS',
 'AGE_ENTRY',
 'FEMALE',
 'MARRIED',
 'DEPENDENT',
 'MD_FAMINC',
 'PREDDEG_1',
 'PREDDEG_2',
 'PREDDEG_3',
 'HIGHDEG_1',
 'HIGHDEG_2',
 'HIGHDEG_3',
 'HIGHDEG_4',
 'CONTROL_2',
 'CONTROL_3',
 'LOCALE_12.0',
 'LOCALE_13.0',
 'LOCALE_21.0',
 'LOCALE_22.0',
 'LOCALE_23.0',
 'LOCALE_31.0',
 'LOCALE_32.0',
 'LOCALE_33.0',
 'LOCALE_41.0',
 'LOCALE_42.0',
 'LOCALE_43.0']

In [555]:
rfedf = X[new_features1]
rfedf.head()

Unnamed: 0,MAIN,NUMBRANCH,HBCU,PBI,WOMENONLY,DISTANCEONLY,UGDS,AGE_ENTRY,FEMALE,MARRIED,...,LOCALE_13.0,LOCALE_21.0,LOCALE_22.0,LOCALE_23.0,LOCALE_31.0,LOCALE_32.0,LOCALE_33.0,LOCALE_41.0,LOCALE_42.0,LOCALE_43.0
0,1,1,0.0,0.0,0.0,0.0,2056.0,25.927665,0.706853,0.176396,...,0,0,0,0,0,1,0,0,0,0
1,1,1,0.0,0.0,0.0,0.0,194.0,29.712871,0.732673,0.188119,...,0,0,0,0,0,0,0,1,0,0
2,1,1,0.0,0.0,0.0,0.0,1075.0,21.532362,0.543689,0.061489,...,0,0,0,0,0,0,1,0,0,0
3,1,2,0.0,0.0,0.0,0.0,312.0,29.542683,0.847561,0.288618,...,0,1,0,0,0,0,0,0,0,0
4,1,1,0.0,0.0,0.0,0.0,1656.0,22.794651,0.475645,0.120344,...,0,1,0,0,0,0,0,0,0,0


In [556]:
model1 = lm.fit(rfedf,y)
predictions1 = model1.predict(rfedf)
scores = cross_val_score(model1, rfedf, y, cv=5)
scores

array([ 0.45306362,  0.4875242 ,  0.46988523,  0.41282821,  0.43871388])

In [557]:
print 'R^2:', model1.score(rfedf,y)
print model1.coef_
print 'RMSE:', np.sqrt(metrics.mean_squared_error(y, predictions1))

R^2: 0.465118136625
[ -5.82043015e+00  -2.59865175e-01   3.03920246e+01   7.56573573e+00
   4.83616607e+00  -2.11350822e+01  -2.85743173e-04  -1.54207784e-01
   1.22428712e+01  -7.47510312e+01  -5.87765444e+01   6.66092639e-04
  -2.27961865e+01  -1.86730505e+01  -1.28460189e+01   1.24693158e+00
   3.25713296e+00   1.28035021e+01   1.00554521e+01   1.46840007e+01
   3.18718991e+01   2.68424977e+00   4.45378914e+00   2.86954854e+00
   1.62984020e+00   1.95867177e+00   8.27909225e+00   6.36547479e+00
   6.97199044e+00   4.18034074e+00  -2.01090556e+00  -7.86687016e+00]
RMSE: 20.4832683465


In [565]:
lasso = Lasso(alpha =.001)

In [566]:
modellasso = lasso.fit(rfedf,y)
print modellasso.score(rfedf,y)
predictionlasso = modellasso.predict(rfedf)
print np.sqrt(metrics.mean_squared_error(y, predictionlasso))

0.465086301596
20.4838778977


In [None]:
ridge = Ridge(alpha = 0.000000001)

In [567]:
modelridge = ridge.fit(rfedf,y)
print modelridge.score(rfedf,y)
predictionridge = modelridge.predict(rfedf)
print np.sqrt(metrics.mean_squared_error(y, predictionridge))

0.465118136625
20.4832683465


GridSearchCV
--

In [239]:
hyperparameters = {'fit_intercept':[True,False], 'normalize':[True,False], 'copy_X':[True, False]}

In [238]:
estimator = lm
estimator.get_params().keys()

['copy_X', 'normalize', 'n_jobs', 'fit_intercept']

In [240]:
grid_search = GridSearchCV(lm, hyperparameters)
grid_search.fit(X,y)

GridSearchCV(cv=None, error_score='raise',
       estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'normalize': [True, False], 'copy_X': [True, False], 'fit_intercept': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [241]:
zip(features,grid_search.best_estimator_.coef_.reshape(-1,1))

[('MAIN', array([-5.82043015])),
 ('NUMBRANCH', array([-0.25986517])),
 ('HBCU', array([ 30.39202457])),
 ('PBI', array([ 7.56573573])),
 ('WOMENONLY', array([ 4.83616607])),
 ('DISTANCEONLY', array([-21.13508217])),
 ('UGDS', array([-0.00028574])),
 ('AGE_ENTRY', array([-0.15420778])),
 ('FEMALE', array([ 12.24287117])),
 ('MARRIED', array([-74.75103116])),
 ('DEPENDENT', array([-58.77654444])),
 ('MD_FAMINC', array([ 0.00066609])),
 ('PREDDEG_1', array([-22.79618652])),
 ('PREDDEG_2', array([-18.67305051])),
 ('PREDDEG_3', array([-12.84601886])),
 ('HIGHDEG_1', array([ 1.24693158])),
 ('HIGHDEG_2', array([ 3.25713296])),
 ('HIGHDEG_3', array([ 12.80350212])),
 ('HIGHDEG_4', array([ 10.05545214])),
 ('CONTROL_2', array([ 14.6840007])),
 ('CONTROL_3', array([ 31.87189913])),
 ('LOCALE_12.0', array([ 2.68424977])),
 ('LOCALE_13.0', array([ 4.45378914])),
 ('LOCALE_21.0', array([ 2.86954854])),
 ('LOCALE_22.0', array([ 1.6298402])),
 ('LOCALE_23.0', array([ 1.95867177])),
 ('LOCALE_31.0'

In [245]:
best_features = []
for i,k in zip(grid_search.best_estimator_.coef_, X.columns):
    if abs(i) > 0.5:
        best_features.append(k)

In [246]:
gsdf = X[best_features]

In [247]:
best_features

['MAIN',
 'HBCU',
 'PBI',
 'WOMENONLY',
 'DISTANCEONLY',
 'FEMALE',
 'MARRIED',
 'DEPENDENT',
 'PREDDEG_1',
 'PREDDEG_2',
 'PREDDEG_3',
 'HIGHDEG_1',
 'HIGHDEG_2',
 'HIGHDEG_3',
 'HIGHDEG_4',
 'CONTROL_2',
 'CONTROL_3',
 'LOCALE_12.0',
 'LOCALE_13.0',
 'LOCALE_21.0',
 'LOCALE_22.0',
 'LOCALE_23.0',
 'LOCALE_31.0',
 'LOCALE_32.0',
 'LOCALE_33.0',
 'LOCALE_41.0',
 'LOCALE_42.0',
 'LOCALE_43.0']

In [248]:
model2 = grid_search.best_estimator_.fit(gsdf,y)
predictions2 = model2.predict(gsdf)
scores = cross_val_score(model2, gsdf, y, cv=5)
scores

array([ 0.39178039,  0.42862909,  0.40695938,  0.35586195,  0.35894649])

In [249]:
print 'R^2:', model2.score(gsdf,y)
print model2.coef_
print 'RMSE:', np.sqrt(metrics.mean_squared_error(y, predictions2))

R^2: 0.400171192778
[ -3.97320625  20.08562472   3.95345355   1.8097741  -19.58840473
  16.04859906 -49.28044771 -18.70724398 -21.37307046 -17.92913374
  -9.53027711  -0.93459063  -0.16184311  10.58535572  10.05946714
  19.83304073  32.74052175   3.61613959   4.79147358   3.70890675
   3.36195342   3.98915209   9.80937986   7.24113189   8.43633872
   5.8655354   -1.90795705  -7.90396625]
RMSE: 21.6912199501


In [252]:
#Make a pipeline using SelectKBest

In [568]:
dfkbest.columns

Index([u'MAIN', u'NUMBRANCH', u'HBCU', u'PBI', u'WOMENONLY', u'DISTANCEONLY',
       u'UGDS', u'AGE_ENTRY', u'FEMALE', u'MARRIED', u'DEPENDENT',
       u'MD_FAMINC', u'PREDDEG_1', u'PREDDEG_2', u'PREDDEG_3', u'HIGHDEG_1',
       u'HIGHDEG_2', u'HIGHDEG_3', u'HIGHDEG_4', u'CONTROL_2', u'CONTROL_3',
       u'LOCALE_12.0', u'LOCALE_13.0', u'LOCALE_21.0', u'LOCALE_22.0',
       u'LOCALE_23.0', u'LOCALE_31.0', u'LOCALE_32.0', u'LOCALE_33.0',
       u'LOCALE_41.0', u'LOCALE_42.0', u'LOCALE_43.0'],
      dtype='object')

In [609]:
def main(dataframe):
    return dataframe['MAIN'].values.reshape(-1,1)

def branch(dataframe):
    return dataframe['NUMBRANCH'].values.reshape(-1,1)

def hbcu(dataframe):
    return dataframe['HBCU'].values.reshape(-1,1)

def pbi(dataframe):
    return dataframe['PBI'].values.reshape(-1,1)

def women(dataframe):
    return dataframe['WOMENONLY'].values.reshape(-1,1)

def distance(dataframe):
    return dataframe['DISTANCEONLY'].values.reshape(-1,1)

def ugds(dataframe):
    return dataframe['UGDS'].values.reshape(-1,1)

def ageentry(dataframe):
    return dataframe['AGE_ENTRY'].values.reshape(-1,1)

def female(dataframe):
    return dataframe['FEMALE'].values.reshape(-1,1)

def married(dataframe):
    return dataframe['MARRIED'].values.reshape(-1,1)

def dependent(dataframe):
    return dataframe['DEPENDENT'].values.reshape(-1,1)

def mdfam(dataframe):
    return dataframe['MD_FAMINC'].values.reshape(-1,1)

def preddeg(dataframe):
    return dataframe['PREDDEG'].values.reshape(-1,1)

def columnisopred(series):
    return series[:,1:]

def highdeg(dataframe):
    return dataframe['HIGHDEG'].values.reshape(-1,1)

def control(dataframe):
    return dataframe['CONTROL'].values.reshape(-1,1)

def locale(dataframe):
#     dataframe['LOCALE'] = dataframe['LOCALE'].apply(lambda x: 21.0 if x == -3.0 else x)
    return dataframe['LOCALE'].values.reshape(-1,1)

def columnisoloc(series):
    return series[:,2:]

In [626]:
main_pipe = make_pipeline(FunctionTransformer(main, validate = False),
                         Imputer(strategy = 'most_frequent'))

branch_pipe = make_pipeline(FunctionTransformer(branch, validate = False),
                           Imputer())

hbcu_pipe = make_pipeline(FunctionTransformer(hbcu, validate = False),
                         Imputer(strategy = 'most_frequent'))

pbi_pipe = make_pipeline(FunctionTransformer(pbi, validate = False),
                        Imputer(strategy = 'most_frequent'))

women_pipe = make_pipeline(FunctionTransformer(women, validate = False),
                          Imputer(strategy = 'most_frequent'))

distance_pipe = make_pipeline(FunctionTransformer(distance, validate = False),
                             Imputer(strategy = 'most_frequent'))

ugds_pipe = make_pipeline(FunctionTransformer(distance, validate = False),
                         Imputer())

ageentry_pipe = make_pipeline(FunctionTransformer(ageentry, validate = False),
                             Imputer())

female_pipe = make_pipeline(FunctionTransformer(female, validate = False),
                           Imputer())

married_pipe = make_pipeline(FunctionTransformer(married, validate = False),
                            Imputer())

dependent_pipe = make_pipeline(FunctionTransformer(dependent, validate = False),
                              Imputer())

mdfam_pipe = make_pipeline(FunctionTransformer(mdfam, validate = False),
                          Imputer())

preddeg_pipe = make_pipeline(FunctionTransformer(preddeg, validate = False),
                             Imputer(),
                            LabelBinarizer(),
                            FunctionTransformer(columnisopred, validate = False))

highdeg_pipe = make_pipeline(FunctionTransformer(highdeg, validate = False),
                             Imputer(),
                            LabelBinarizer(),
                            FunctionTransformer(columnisopred, validate = False))

control_pipe = make_pipeline(FunctionTransformer(control, validate = False),
                             Imputer(),
                            LabelBinarizer(),
                            FunctionTransformer(columnisopred, validate = False))

locale_pipe = make_pipeline(FunctionTransformer(locale, validate = False),
                            Imputer(),
                           LabelBinarizer(),
                           FunctionTransformer(columnisoloc, validate = False))

In [608]:
lb = LabelBinarizer()
hi=lb.fit_transform(training['LOCALE'])
lb.classes_
#to test which column to isolate

array([ -3.,  11.,  12.,  13.,  21.,  22.,  23.,  31.,  32.,  33.,  41.,
        42.,  43.])

In [577]:
test = pd.read_csv('university_test (1).csv')

In [581]:
training = pd.read_csv('university_train.csv')

In [627]:
fu = make_union(main_pipe,branch_pipe, hbcu_pipe, pbi_pipe,women_pipe, distance_pipe,ugds_pipe,ageentry_pipe,
               female_pipe, married_pipe, dependent_pipe, dependent_pipe, mdfam_pipe, preddeg_pipe,
               highdeg_pipe, control_pipe, locale_pipe)

In [628]:
fu.fit(training)

FeatureUnion(n_jobs=1,
       transformer_list=[('pipeline-1', Pipeline(steps=[('functiontransformer', FunctionTransformer(accept_sparse=False, func=<function main at 0x11d1648c0>,
          inv_kw_args=None, inverse_func=None, kw_args=None, pass_y=False,
          validate=False)), ('imputer', Imputer(axis=0, copy=True, missing...48>, inv_kw_args=None,
          inverse_func=None, kw_args=None, pass_y=False, validate=False))]))],
       transformer_weights=None)

In [643]:
ridge = Ridge(alpha = 0.1)

In [633]:
y = training['percent_on_student_loan'].values

In [644]:
ridge.fit(fu.transform(training),y)

Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [645]:
ridge.score(fu.transform(training),y)

0.46387011721740318

In [637]:
lm = LinearRegression()
lm.fit(fu.transform(training),y)
lm.score(fu.transform(training),y)

0.46387926458217826

In [646]:
def evaluation_transformation(dataset, predictions):
    dataset = dataset.join(pd.DataFrame(predictions, columns=['Prediction']))
    dataset[['id_number', 'Prediction']].to_csv('submissionr.csv', index=False)
    
predictions = ridge.predict(fu.transform(test))

evaluation_transformation(test, predictions)

In [642]:
def evaluation_transformationlm(dataset, predictions):
    dataset = dataset.join(pd.DataFrame(predictions, columns=['Prediction']))
    dataset[['id_number', 'Prediction']].to_csv('submissionlm.csv', index=False)
    
predictionlm = lm.predict(fu.transform(test))

evaluation_transformationlm(test, predictionlm)