In this notebook, we'll be attempting to see which console (XBox or PS4) is more popular in the states based on sales and year. Because there are multiple versions of the Playstation, and Xbox, we'll be mixing them together. 

In [233]:
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt

from sklearn import preprocessing
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression

Reading the dataframe in from the file.

In [234]:
df = pd.read_csv("vgsales.csv")

Checking for unique values in the Platform column. 

In [235]:
df["Platform"].unique()

array(['Wii', 'NES', 'GB', 'DS', 'X360', 'PS3', 'PS2', 'SNES', 'GBA',
       '3DS', 'PS4', 'N64', 'PS', 'XB', 'PC', '2600', 'PSP', 'XOne', 'GC',
       'WiiU', 'GEN', 'DC', 'PSV', 'SAT', 'SCD', 'WS', 'NG', 'TG16',
       '3DO', 'GG', 'PCFX'], dtype=object)

Changing all Xbox consoles to 0, and all Playstation consoles to 1. 

In [236]:
Xbox = ["XB","X360","XOne"]
Playstation = ["PS4","PS3","PS2","PS"]

In [237]:
df["Platform"] = np.where(df["Platform"].isin(Xbox),0,df["Platform"])
df["Platform"] = np.where(df["Platform"].isin(Playstation),1,df["Platform"])

In [238]:
df.loc[df["Platform"] == 1]

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
16,17,Grand Theft Auto V,1,2013.0,Action,Take-Two Interactive,7.01,9.27,0.97,4.14,21.40
17,18,Grand Theft Auto: San Andreas,1,2004.0,Action,Take-Two Interactive,9.43,0.40,0.41,10.57,20.81
24,25,Grand Theft Auto: Vice City,1,2002.0,Action,Take-Two Interactive,8.41,5.49,0.47,1.78,16.15
28,29,Gran Turismo 3: A-Spec,1,2001.0,Racing,Sony Computer Entertainment,6.85,5.09,1.87,1.16,14.98
33,34,Call of Duty: Black Ops 3,1,2015.0,Shooter,Activision,5.77,5.81,0.35,2.31,14.24
...,...,...,...,...,...,...,...,...,...,...,...
16561,16564,Kanokon: Esuii,1,2008.0,Adventure,5pb,0.00,0.00,0.01,0.00,0.01
16570,16573,Farming 2017 - The Simulation,1,2016.0,Simulation,UIG Entertainment,0.00,0.01,0.00,0.00,0.01
16572,16575,Scarlett: Nichijou no Kyoukaisen,1,2008.0,Adventure,Kadokawa Shoten,0.00,0.00,0.01,0.00,0.01
16580,16583,Real Rode,1,2008.0,Adventure,Kadokawa Shoten,0.00,0.00,0.01,0.00,0.01


Getting the x and y values.

In [239]:
selected_df = df.loc[df["Platform"].isin([0,1])]
selected_df = selected_df.dropna()

Setting up the y values, either Xbox or Playstation

In [240]:
selected_y = selected_df["Platform"]
selected_y = selected_y.astype(int)

In [241]:
selected_x = selected_df[["Year","NA_Sales","EU_Sales","JP_Sales"]]

Checking for NaN values, and thus removing it, if found. 

In [242]:
selected_x.isnull().values.any()
selected_df.isnull().values.any()

False

Preprocessing data. 

In [243]:
x_scale = np.array(preprocessing.scale(selected_x))

Splitting up the data, and then reshaping so that we can use it to fit the logistic regression model. 

In [244]:
x_train,x_test,y_train,y_test = train_test_split(x_scale,selected_y)
y_2d_train = y_train.values.reshape(y_train.shape[0],1)
y_2d_test = y_test.values.reshape(y_test.shape[0],1)

All around function

In [245]:
def accuracies(x_train,y_train,x_test,y_test):
  yhat_train = logreg.predict(x_train)
  acc_train = np.mean(yhat_train == y_train)
  print("Accuracy on training data = %f" % acc_train)

  yhat_test = logreg.predict(x_test)
  acc_test = np.mean(yhat_test == y_test)
  print("Accuracy on test data = %f" % acc_test)

  return acc_train,acc_test


In [246]:
num = [.0001,.001,.01,.1,1,10]

In [247]:
ridge_test = []
ridge_train = []
lasso_test = []
lasso_train = []
minmax_test = []
minmax_train = []
pft_test = []
pft_train = []

# Logistic Regression with Ridge

In [248]:
for c in num:
  logreg = LogisticRegression(C = c)
  logreg.fit(x_train,y_2d_train)
  train,test = accuracies(x_train,y_2d_train,x_test,y_2d_test)
  ridge_test.append(test)
  ridge_train.append(train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Accuracy on training data = 0.685603
Accuracy on test data = 0.694229
Accuracy on training data = 0.684436
Accuracy on test data = 0.693366


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Accuracy on training data = 0.655929
Accuracy on test data = 0.664264
Accuracy on training data = 0.636558
Accuracy on test data = 0.641198


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Accuracy on training data = 0.626529
Accuracy on test data = 0.630635
Accuracy on training data = 0.624400
Accuracy on test data = 0.627617


# Logistic Regression with Lasso

In [249]:
for c in num:
  logreg = LogisticRegression(penalty = "l1", solver="liblinear",C = c)
  logreg.fit(x_train,y_2d_train)
  train,test = accuracies(x_train,y_2d_train,x_test,y_2d_test)
  lasso_test.append(test)
  lasso_train.append(train)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Accuracy on training data = 0.314397
Accuracy on test data = 0.305771
Accuracy on training data = 0.685603
Accuracy on test data = 0.694229
Accuracy on training data = 0.659157
Accuracy on test data = 0.669223
Accuracy on training data = 0.631612
Accuracy on test data = 0.634300


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Accuracy on training data = 0.624949
Accuracy on test data = 0.628480
Accuracy on training data = 0.624193
Accuracy on test data = 0.627617


  y = column_or_1d(y, warn=True)


# Logistic Regression with MinMax Scaler

In [250]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x_minmax = scaler.fit_transform(x_train)
x_minmax_test = scaler.fit_transform(x_test)
for c in [.001,.01,.1,1,10]:
  logreg = LogisticRegression(C = c)
  logreg.fit(x_minmax,y_2d_train)
  train, test =accuracies(x_minmax,y_2d_train,x_minmax_test,y_2d_test)
  minmax_test.append(test)
  minmax_train.append(train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Accuracy on training data = 0.685603
Accuracy on test data = 0.694229
Accuracy on training data = 0.685603
Accuracy on test data = 0.694229


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Accuracy on training data = 0.676399
Accuracy on test data = 0.668145
Accuracy on training data = 0.658264
Accuracy on test data = 0.651115


  y = column_or_1d(y, warn=True)


Accuracy on training data = 0.643221
Accuracy on test data = 0.627402


# Logistic Regression with Polynomial Feature Transformation

---



In [251]:
for degree in [1,2,3,4,5]:
  print("Degree: " ,degree)
  poly_features = preprocessing.PolynomialFeatures(degree = degree)
  x_poly = poly_features.fit_transform(x_train)
  x_poly_test = poly_features.fit_transform(x_test)
  for c in num:
    print("C: ", c)
    logreg = LogisticRegression(C = c,max_iter = 1000)
    logreg.fit(x_poly,y_2d_train)
    train,test = accuracies(x_poly,y_2d_train,x_poly_test,y_2d_test)
    pft_test.append((test,degree,c))
    pft_train.append((train,degree,c))

  y = column_or_1d(y, warn=True)


Degree:  1
C:  0.0001
Accuracy on training data = 0.685603
Accuracy on test data = 0.694229
C:  0.001


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Accuracy on training data = 0.684436
Accuracy on test data = 0.693366
C:  0.01
Accuracy on training data = 0.655929
Accuracy on test data = 0.664264
C:  0.1


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Accuracy on training data = 0.636558
Accuracy on test data = 0.641198
C:  1
Accuracy on training data = 0.626529
Accuracy on test data = 0.630635
C:  10


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Accuracy on training data = 0.624400
Accuracy on test data = 0.627617
Degree:  2
C:  0.0001
Accuracy on training data = 0.684642
Accuracy on test data = 0.693366
C:  0.001


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Accuracy on training data = 0.681138
Accuracy on test data = 0.690995
C:  0.01
Accuracy on training data = 0.664721
Accuracy on test data = 0.676121
C:  0.1


  y = column_or_1d(y, warn=True)


Accuracy on training data = 0.621034
Accuracy on test data = 0.629126
C:  1


  y = column_or_1d(y, warn=True)


Accuracy on training data = 0.601525
Accuracy on test data = 0.603042
C:  10


  y = column_or_1d(y, warn=True)


Accuracy on training data = 0.598984
Accuracy on test data = 0.600240
Degree:  3
C:  0.0001
Accuracy on training data = 0.682650
Accuracy on test data = 0.692504
C:  0.001


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Accuracy on training data = 0.677086
Accuracy on test data = 0.688193
C:  0.01


  y = column_or_1d(y, warn=True)


Accuracy on training data = 0.665202
Accuracy on test data = 0.675474
C:  0.1


  y = column_or_1d(y, warn=True)


Accuracy on training data = 0.627903
Accuracy on test data = 0.638612
C:  1


  y = column_or_1d(y, warn=True)


Accuracy on training data = 0.607158
Accuracy on test data = 0.614899
C:  10


  y = column_or_1d(y, warn=True)


Accuracy on training data = 0.598709
Accuracy on test data = 0.605629
Degree:  4
C:  0.0001


  y = column_or_1d(y, warn=True)


Accuracy on training data = 0.680726
Accuracy on test data = 0.690348
C:  0.001


  y = column_or_1d(y, warn=True)


Accuracy on training data = 0.676467
Accuracy on test data = 0.688408
C:  0.01


  y = column_or_1d(y, warn=True)


Accuracy on training data = 0.659089
Accuracy on test data = 0.668360
C:  0.1


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  y = column_or_1d(y, warn=True)


Accuracy on training data = 0.619591
Accuracy on test data = 0.624815
C:  1


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  y = column_or_1d(y, warn=True)


Accuracy on training data = 0.584971
Accuracy on test data = 0.588815
C:  10


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  y = column_or_1d(y, warn=True)


Accuracy on training data = 0.580094
Accuracy on test data = 0.582132
Degree:  5
C:  0.0001
Accuracy on training data = 0.679490
Accuracy on test data = 0.689917
C:  0.001


  y = column_or_1d(y, warn=True)


Accuracy on training data = 0.673514
Accuracy on test data = 0.686253
C:  0.01


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  y = column_or_1d(y, warn=True)


Accuracy on training data = 0.660050
Accuracy on test data = 0.669223
C:  0.1


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  y = column_or_1d(y, warn=True)


Accuracy on training data = 0.612653
Accuracy on test data = 0.617055
C:  1


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  y = column_or_1d(y, warn=True)


Accuracy on training data = 0.593214
Accuracy on test data = 0.596144
C:  10
Accuracy on training data = 0.584147
Accuracy on test data = 0.583857


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [254]:
print(pft_test)
print(pft_train)

best_res_test = 0
best_res_train = 0
best_degree, best_c = 0,0

for num in pft_test:
  if num[0] > best_res_test:
    best_res_test = num[0]
    best_degree,best_c = num[1],num[2]

print(best_degree,best_c)

[(0.6942286348501665, 1, 0.0001), (0.6933663545622634, 1, 0.001), (0.6642643948455348, 1, 0.01), (0.6411983971441277, 1, 0.1), (0.630635463617315, 1, 1), (0.6276174826096543, 1, 10), (0.6933663545622634, 2, 0.0001), (0.69099508377053, 2, 0.001), (0.676120748804202, 2, 0.01), (0.6291264731134847, 2, 0.1), (0.6030424944044168, 2, 1), (0.6002400834687319, 2, 10), (0.6925040742743603, 3, 0.0001), (0.688192672834845, 3, 0.001), (0.6754740385882747, 3, 0.01), (0.6386115562804184, 3, 0.1), (0.6148988483630841, 3, 1), (0.605629335268126, 3, 10), (0.6903483735546027, 4, 0.0001), (0.6884082429068208, 4, 0.001), (0.6683602262130743, 4, 0.01), (0.6248150716739693, 4, 0.1), (0.5888148696540162, 4, 1), (0.5821321974227675, 4, 10), (0.6899172334106511, 5, 0.0001), (0.6862525421870631, 5, 0.001), (0.6692225065009775, 5, 0.01), (0.6170545490828417, 5, 0.1), (0.5961442521011923, 5, 1), (0.5838567579985735, 5, 10)]
[(0.6856032568467801, 1, 0.0001), (0.6844355087467079, 1, 0.001), (0.6559287168920026, 1, 