# Splitting data to avoid overfitting!


A way to avoid overfitting is by splitting your data randomly into two subsets:

* Train data: The data that we fit our model on and decide the way in which predictions are made

* Test data: The data that we will use to check how well we are doing

Since we did not fit our model with the test data, the goodness-of-fit results (our score, or $R^2$) let us know how well the model we fitted works on data it has never seen.

This split is why we usually say "we are training the model": it means we fit the model on the training data only.


We will do one more exploration with the 2016 election data. 

In [None]:
import pandas as pd
import numpy as np
from plotnine import *
%matplotlib inline
from sklearn.neighbors import KNeighborsRegressor as knn

In [None]:
# Load the election dataframe, keep only the rows that have an income value,
# and add the natural log of the population as the `lnpop` column.
url = 'https://raw.githubusercontent.com/mwaugh0328/Data_Bootcamp_Fall_2017/master/data_bootcamp_1127/trump_data.csv'
df = (
    pd.read_csv(url, encoding='latin-1', index_col=0)
    .dropna(subset=['income'])
    .assign(lnpop=lambda frame: np.log(frame['population']))
)
df.head()


In [None]:
# Deliberately overfit: a nearest-neighbors regressor with only 2 neighbors,
# fitted on every row with `lnpop` as the single feature.
sklearn_knn = knn(n_neighbors=2)
sklearn_knn.fit(X=df[['lnpop']], y=df['trump_share'])

# Fit score, computed on the very same rows the model was fitted on
sklearn_knn.score(X=df[['lnpop']], y=df['trump_share'])


In [None]:
# Store the 2-neighbor model's prediction for every row so it can be plotted
lnpop_features = df[['lnpop']]
df['pred_knn'] = sklearn_knn.predict(lnpop_features)


In [None]:
# Plot the observed values (points) against the 2-neighbor predictions (line)
knn2_fit_plot = (
    ggplot(df, aes(x='lnpop', y='trump_share'))
    + geom_point(color='steelblue')
    + geom_line(aes(x='lnpop', y='pred_knn'), color='darkorange')
    + theme_bw()
)
knn2_fit_plot


In [None]:
#This is the module we will need

from sklearn.model_selection import train_test_split

In [None]:
# Shuffle the rows and hold out 20% of them as the test split.
# random_state is fixed so the same split is produced on every run.
train, test = train_test_split(
    df, test_size=0.2, shuffle=True, random_state=42
)


In [None]:
# Row counts of the full dataset and of each split
print("The size of the election dataset", len(df))
print("The size of the train dataset", len(train))
print("The size of the test dataset", len(test))

In [None]:
# Let's use a 5-neighbor model to see how it works

# Fit on the train data and score on that same train data
knn5_on_train = knn(n_neighbors=5).fit(train[['lnpop']], train['trump_share'])
knn5_on_train.score(train[['lnpop']], train['trump_share'])

In [None]:
# Same 5-neighbor model fitted on the train data, but this time scored on
# the test data, which was not used for fitting
knn5_heldout = knn(n_neighbors=5).fit(train[['lnpop']], train['trump_share'])
knn5_heldout.score(test[['lnpop']], test['trump_share'])



In [None]:
# Scores on the train split for every odd number of neighbors from 5 to 499:
# each model is fitted on the train data and scored on that same data
scoretrain = []
for k in range(5, 500, 2):
    fitted = knn(n_neighbors=k).fit(train[['lnpop']], train['trump_share'])
    scoretrain.append(fitted.score(train[['lnpop']], train['trump_share']))


In [None]:
# One row per neighbor count, labelled as coming from the training split.
# The neighbors column uses the same range the scores were computed over.
trainscore = pd.DataFrame({"score": scoretrain}).assign(
    datsplit='Training',
    neighbors=list(range(5, 500, 2)),
)

In [None]:
# Peek at the first rows of the training scores
trainscore.head()

In [None]:
# Now the scores on the testing split: each model is still fitted on the
# train data, but scored on the test data
scoretest = []
for k in range(5, 500, 2):
    fitted = knn(n_neighbors=k).fit(train[['lnpop']], train['trump_share'])
    scoretest.append(fitted.score(test[['lnpop']], test['trump_share']))




In [None]:
# One row per neighbor count, labelled as coming from the testing split.
# The neighbors column uses the same range the scores were computed over.
testscore = pd.DataFrame({"score": scoretest}).assign(
    datsplit='Testing',
    neighbors=list(range(5, 500, 2)),
)

In [None]:
# Peek at the first rows of the testing scores
testscore.head()

In [None]:
# Stack the training and testing scores on top of each other so both
# splits can be drawn from a single dataframe
split_scores = [trainscore, testscore]
score_df = pd.concat(split_scores, axis=0)

score_df.tail()

In [None]:
# Score against number of neighbors, one colored line per data split
split_score_plot = (
    ggplot(score_df, aes(x='neighbors', y='score', color='datsplit'))
    + geom_line()
    + scale_color_manual(values=['darkorange', 'steelblue'])
    + theme_bw()
)
split_score_plot

In [None]:
# Zoom in on the testing scores only
test_score_plot = (
    ggplot(testscore, aes(x='neighbors', y='score'))
    + geom_line(color='darkorange')
    + theme_bw()
)
test_score_plot

In [None]:
# Highest score reached on the testing split
testscore['score'].max()

In [None]:
# What is the optimal number of neighbors?
# Select the row(s) whose score equals the best testing score.
best_test_score = testscore['score'].max()
is_best = testscore['score'] == best_test_score
testscore.loc[is_best, 'neighbors']

Could we do this in a more automatic way that is not totally dependent on getting lucky that the split is representative?


#### Train_test_split

What we have done

Shuffle


2 groups: A (train), B (test)



To find the hyperparameters that give us the best result, we try several values, score each one on the test group (B), and keep the value with the best score.


#### Cross_validation

Shuffle

4 groups: A, B, C, D

1) train on A,B,C ; test on D


2) train on A,B,D ; test on C


3) train on A,C,D ; test on B


4) train on B,C,D ; test on A


To find the optimal hyperparameters, we compute the test scores for a large range of hyperparameter values. For each value, we average the test scores over all the splits above. Our best hyperparameter value is then the one with the highest mean score.

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Cross-validation scores for a single value (10 neighbors) with 5 groups:
# the result holds one score per group
knn10 = knn(n_neighbors=10)
cross_val_score(knn10, X=df[['lnpop']], y=df['trump_share'], cv=5)


In [None]:
# Average the 5 cross-validation scores of the 10-neighbor model
fold_scores = cross_val_score(
    knn(n_neighbors=10), X=df[['lnpop']], y=df['trump_share'], cv=5
)
fold_scores.mean()


In [None]:
# Mean 5-group cross-validation score for every neighbor count from 5 to 499.
# NOTE(review): scikit-learn documents that an integer `cv` splits the rows
# without shuffling for a regressor — confirm the row order of df is not
# informative, otherwise pass an explicit shuffled splitter.
scoreCV = []
for k in range(5, 500):
    fold_scores = cross_val_score(
        knn(n_neighbors=k), X=df[['lnpop']], y=df['trump_share'], cv=5
    )
    scoreCV.append(fold_scores.mean())



In [None]:
# One row per neighbor count with its mean cross-validation score.
# The neighbors column uses the same range the scores were computed over.
df_CV = pd.DataFrame({'scores': scoreCV}).assign(
    neighbors=list(range(5, 500)),
)

df_CV.head()

In [None]:
# Mean cross-validation score against the number of neighbors
cv_score_plot = (
    ggplot(df_CV, aes(x='neighbors', y='scores'))
    + geom_line(color='darkorange')
    + theme_bw()
)
cv_score_plot

In [None]:
# What is the highest mean cross-validation score?
df_CV['scores'].max()

In [None]:
# Let's get the optimal number of neighbors:
# select the row(s) whose mean score equals the best one.
best_cv_score = df_CV['scores'].max()
is_best = df_CV['scores'] == best_cv_score
df_CV.loc[is_best, 'neighbors']

We have 130 neighbors!!!


In [None]:
# Let's drop the 2-neighbor prediction column so it can be replaced below.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second run no longer raises KeyError once the
# column is already gone.
df = df.drop(columns=['pred_knn'], errors='ignore')

In [None]:
# Let's replace it with the predictions of a 130-neighbor model
# fitted on all the rows
knn130 = knn(n_neighbors=130).fit(X=df[['lnpop']], y=df['trump_share'])
df['pred_knn'] = knn130.predict(df[['lnpop']])


In [None]:
# Observed values (points) against the current `pred_knn` column (line)
knn_fit_plot = (
    ggplot(df, aes(x='lnpop', y='trump_share'))
    + geom_point(color='steelblue')
    + geom_line(aes(x='lnpop', y='pred_knn'), color='darkorange')
    + theme_bw()
)
knn_fit_plot

In [None]:
 from sklearn.ensemble import RandomForestRegressor as rf

In [None]:
# We keep the number of estimators fixed at 100 and iterate over the
# tree depths 1 through 6 (range(1, 7) stops before 7).
# random_state is fixed (42, the same seed used for the train/test split)
# so the forests, and therefore the scores and the depth picked from them,
# are the same on every run.
scoresrf = [
    cross_val_score(
        rf(n_estimators=100, max_depth=depth, random_state=42),
        df[['lnpop']], df['trump_share'],
        cv=5,
    ).mean()
    for depth in range(1, 7)
]

In [None]:
# One row per tree depth with its mean cross-validation score.
# The depth column uses the same range the scores were computed over.
rf_df = pd.DataFrame({'scores': scoresrf}).assign(
    depth=list(range(1, 7)),
)

In [None]:
# Show the whole table: it only has one row per depth tried
rf_df

In [None]:
# Mean cross-validation score against the tree depth
rf_score_plot = (
    ggplot(rf_df, aes(x='depth', y='scores'))
    + geom_line(color='darkorange')
    + theme_bw()
)
rf_score_plot

In [None]:
# Let's find the depth with the best score:
# select the row(s) whose mean score equals the highest one.
best_rf_score = rf_df['scores'].max()
is_best = rf_df['scores'] == best_rf_score
rf_df.loc[is_best, 'depth']

In [None]:
# We see that the max depth of 3 works well!
# Fit a depth-3 forest on all the rows and store its predictions.
# random_state is fixed (42, the same seed used for the train/test split)
# so the predictions, and every plot built from them, are reproducible.
rf_depth3 = rf(n_estimators=100, max_depth=3, random_state=42).fit(
    X=df[['lnpop']], y=df['trump_share']
)
df['pred_rf'] = rf_depth3.predict(df[['lnpop']])


In [None]:
# Observed values (points) against the random forest predictions (line)
rf_fit_plot = (
    ggplot(df, aes(x='lnpop', y='trump_share'))
    + geom_point(color='steelblue')
    + geom_line(aes(x='lnpop', y='pred_rf'), color='darkorange')
    + theme_bw()
)
rf_fit_plot

In [None]:
# Prepare the dataframe: keep only the feature, the observed values and
# the two prediction columns
plot_columns = ['lnpop', 'trump_share', 'pred_knn', 'pred_rf']
plotdf = df[plot_columns]

In [None]:
# Reshape to long form: one row per (observation, model), with the model
# name in `model` and its prediction in `predictions`
plotdf1 = plotdf.melt(
    id_vars=['lnpop', 'trump_share'],
    value_vars=["pred_knn", "pred_rf"],
    var_name='model',
    value_name="predictions",
)

In [None]:
# Peek at the first rows of the long-form dataframe
plotdf1.head()

In [None]:
# Let's see how the optimal knn and optimal rf match!
# Points are the observed values; each colored line is one model's predictions.
model_comparison_plot = (
    ggplot(plotdf1, aes(x='lnpop', y='trump_share', color='model'))
    + geom_point(color='lightgreen')
    + geom_line(aes(x='lnpop', y='predictions'), alpha=0.7, size=2)
    + scale_color_manual(values=['darkorange', 'steelblue'])
    + theme_bw()
)
model_comparison_plot

In [None]:
# Plot one model's predictions against the other's, with a smoothed trend line
prediction_scatter = (
    ggplot(df, aes(x='pred_knn', y='pred_rf'))
    + geom_point(color='darkorange', size=2)
    + geom_smooth(color='blue')
    + theme_bw()
)
prediction_scatter

In [None]:
# Squared pairwise correlations between the observed values and the
# two sets of predictions
compared_columns = ['trump_share', 'pred_knn', 'pred_rf']
dfsq = df[compared_columns].corr().pow(2)
dfsq

In [None]:
# Let's make a heatmap.
# First reshape the square table into long form: one
# [row label, column label, value] entry per cell, rows first.
listoflist = [
    [row_label, col_label, dfsq.loc[row_label, col_label]]
    for row_label in dfsq.index
    for col_label in dfsq.columns
]

heatdf = pd.DataFrame(data=listoflist, columns=["att1", "att2", "R2"])

heatdf

In [None]:
# Heatmap: one tile per pair, filled by its R2 value and labelled with
# that value rounded to two decimals
r2_heatmap = (
    ggplot(data=heatdf, mapping=aes(x='att1', y='att2', fill='R2'))
    + geom_tile()
    + geom_text(
        aes(x='att1', y='att2', label='R2'),
        format_string='{:.2f}',
        color='red',
    )
)
r2_heatmap