In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, RidgeCV
from sklearn.metrics import mean_squared_error

In [2]:
FILE = '../data/Sample_Data.xlsx'
df = pd.read_excel(FILE)

In [3]:
df.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,Label,Y
0,-0.178266,-1.232736,-0.755075,-0.025294,-0.935513,-2.612791,-0.453477,-0.064638,0.70588,0.478928,PS4,-32.65
1,-0.122282,1.082396,1.453829,-1.212121,-0.312355,1.908761,1.45645,0.226062,-0.299493,0.519629,PS4,13.11
2,-0.145936,2.110116,2.042481,-0.841299,0.444485,0.976353,1.874961,0.854823,0.387552,0.011759,PS4,28.92
3,1.789767,1.04523,1.472639,0.51791,-0.422547,2.162058,1.524488,0.37504,0.7415,0.567033,PS4,21.03
4,-0.433913,-0.917178,0.329103,0.369805,-1.812501,-0.581058,0.667508,1.207047,-1.817363,1.533617,PS4,-19.3


In [4]:
df_x = df.iloc[:,0:10]
Y = df['Y']
df_label = df['Label']
labels = df.Label.unique()

In [5]:
components = 1
xes_len = len(df_x.columns)

while components < xes_len:
    pca = PCA(n_components=components).fit(df_x)
    variance = pca.explained_variance_ratio_
    if sum(variance) >= 0.95:
        break
    
    components += 1
else:
    print('The variance did not reach 95%')
    
print('{} components are needed to reach 95%'.format(components))

new_x = pca.transform(df_x)

# Prepares a new dataframe with PCA
# Makes it easier to group code later
_tmp = {'X{}'.format(i): new_x[:, i] for i in range(components)}
pca_df = pd.DataFrame(_tmp)
pca_df['Label'] = df_label
pca_df['Y'] = Y

pca_df.head()

8 components are needed to reach 95%


Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X7,Label,Y
0,0.477049,-0.465901,-0.997683,0.894913,0.387617,-0.826463,1.966999,-1.395952,PS4,-32.65
1,-3.17811,-0.91641,0.457351,-0.285217,-0.648249,-0.339764,-2.0597,0.226125,PS4,13.11
2,-4.440719,-0.657264,1.009417,0.628495,0.23996,-0.248195,-1.270892,-0.024737,PS4,28.92
3,-3.196147,-1.401235,-0.151968,-0.241175,0.472325,1.937969,-1.510854,0.923362,PS4,21.03
4,-0.717565,-1.487003,-1.57967,0.53017,-0.621695,-1.887639,0.336091,1.604094,PS4,-19.3


In [6]:
X_train, X_test, y_train, y_test = train_test_split(new_x, Y, test_size=0.3, random_state=1234)

In [7]:
def analysis(X_tr, y_tr, X_te, y_te):
    results = []

    models = [
        {'model': LinearRegression(), 'name': 'Linear Regression'},
        {'model': Ridge(alpha=0.05), 'name': 'Ridge'},
        {'model': Lasso(alpha=0.05), 'name': 'Lasso'},
        {'model': KNeighborsRegressor(n_neighbors=3), 'name': 'KNR'},
        {'model': RidgeCV(alphas=[0.01, 0.1, 10, 100], cv=5), 'name': 'RidgeCV'}
    ]

    for item in models:
        item['model'].fit(X_tr, y_tr)
        y_pre = item['model'].predict(X_te)
        results.append({
            'score': item['model'].score(X_te, y_te),
            'mse': mean_squared_error(y_te, y_pre),
            'model_name': item['name']
        })
        
    return results


In [58]:
all_results = analysis(X_train, y_train, X_test, y_test)

for result in all_results:
    print('Model used: {}'.format(result['model_name']))
    print('Prediction score on the test set: {}'.format(result['score']))
    print('Mean squared error: {}'.format(result['mse']))
    print('------------------')

Model used: Linear Regression
Prediction score on the test set: 0.8503895008012552
Mean squared error: 51.3190917187997
------------------
Model used: Ridge
Prediction score on the test set: 0.8504137590313194
Mean squared error: 51.3107707096441
------------------
Model used: Lasso
Prediction score on the test set: 0.851495669230774
Mean squared error: 50.93965605489279
------------------
Model used: KNR
Prediction score on the test set: 0.6999122122368339
Mean squared error: 102.9355077777778
------------------
Model used: RidgeCV
Prediction score on the test set: 0.8541774630822088
Mean squared error: 50.01975253629101
------------------


In [59]:
groupings = []
over_all_result = []

for i in range(5):
    over_all_result.append({
        'score': 0,
        'mse': 0,
        'model_name': None
    })
    
for l in labels:
    l2 = pca_df[pca_df['Label'] == l]
    print('Number of instances for group "{}": {}'.format(l, len(l2)))
    _new_x = l2.iloc[:,0:components]
    _Y = l2['Y']
    _df_label = l2['Label']
    
    _X_train, _X_test, _y_train, _y_test = train_test_split(_new_x, _Y, test_size=0.3, random_state=1234)
    
    _all_results = analysis(_X_train, _y_train, _X_test, _y_test)

    for i, result in enumerate(_all_results):
        over_all_result[i]['score'] += result['score']
        over_all_result[i]['mse'] += result['mse']
        over_all_result[i]['model_name'] = result['model_name']
                
print('\n\n\n')

for result in over_all_result:
    print('Model used: {}'.format(result['model_name']))
    print('Prediction score on the test set: {}'.format(result['score']))
    print('Mean squared error: {}'.format(result['mse']))
    print('------------------')

Number of instances for group "PS4": 101
Number of instances for group "Switch": 100
Number of instances for group "Xbox": 99




Model used: Linear Regression
Prediction score on the test set: 2.5709924012129086
Mean squared error: 146.16813360633063
------------------
Model used: Ridge
Prediction score on the test set: 2.570593240161482
Mean squared error: 146.30222369139454
------------------
Model used: Lasso
Prediction score on the test set: 2.5663388706448718
Mean squared error: 147.73903384013377
------------------
Model used: KNR
Prediction score on the test set: 1.6781562515325876
Mean squared error: 447.46194670250907
------------------
Model used: RidgeCV
Prediction score on the test set: 2.5580069891545087
Mean squared error: 150.65732648252998
------------------


