In [320]:
import pandas as pd # for data manipulation
import numpy as np # for data manipulation

from sklearn.model_selection import train_test_split # for splitting the data into train and test samples
from sklearn.metrics import classification_report # for model evaluation metrics
from sklearn.preprocessing import OrdinalEncoder # for encoding categorical features from strings to number arrays

# Differnt types of Naive Bayes Classifiers
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import BernoulliNB

In [321]:
df = pd.read_csv('/Users/navamongkoltongta/OneDrive - Assumption University/Year2.2/Data Science/Project/vgsales.csv')

In [322]:
df.drop(columns=['NA_Sales','EU_Sales','JP_Sales','Other_Sales'], inplace=True)

In [323]:
df.count()

Rank            16598
Name            16598
Platform        16598
Year            16327
Genre           16598
Publisher       16540
Global_Sales    16598
dtype: int64

In [324]:
df.dropna(how='any',subset=['Year','Publisher'],inplace=True)

In [325]:
df[df['Publisher'].isnull()]

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,Global_Sales


In [326]:
df.isnull().sum()

Rank            0
Name            0
Platform        0
Year            0
Genre           0
Publisher       0
Global_Sales    0
dtype: int64

In [327]:
df.count()

Rank            16291
Name            16291
Platform        16291
Year            16291
Genre           16291
Publisher       16291
Global_Sales    16291
dtype: int64

In [328]:
df.describe()

Unnamed: 0,Rank,Year,Global_Sales
count,16291.0,16291.0,16291.0
mean,8290.190228,2006.405561,0.54091
std,4792.65445,5.832412,1.567345
min,1.0,1980.0,0.01
25%,4132.5,2003.0,0.06
50%,8292.0,2007.0,0.17
75%,12439.5,2010.0,0.48
max,16600.0,2020.0,82.74


In [329]:
# White wins flag (1=win vs. 0=not-win) - dependent (target) variable
df['Mean_global']=df['Global_Sales'].apply(lambda x: 'High' if x>=0.5377441 else 'Low')

In [330]:
dfhigh = df.loc[df['Mean_global'] == 'High']
dfhigh.head(5)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,Global_Sales,Mean_global
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,82.74,High
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,40.24,High
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,35.82,High
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,33.0,High
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,31.37,High


In [331]:
dfhigh.Genre.value_counts()

Action          758
Sports          601
Shooter         412
Role-Playing    366
Misc            311
Racing          305
Platform        301
Fighting        219
Simulation      171
Puzzle           93
Adventure        86
Strategy         78
Name: Genre, dtype: int64

In [332]:
# dummies = pd.get_dummies(inputs.Genre)
# dummies.head(10)
# def genretoid(x):
#     if x == "Action":
#        return 1
#     elif x == "Sports":
#        return 2
#     elif x == "Shooter":
#        return 3
#     elif x == "Role-Playing":
#        return 4
#     elif x == "Misc":
#        return 5
#     elif x == "Racing":
#        return 6
#     elif x == "Platform":
#        return 7
#     elif x == "Fighting":
#        return 8
#     elif x == "Simulation":
#        return 9
#     elif x == "Puzzle":
#        return 10
#     elif x == "Adventure":
#        return 11
#     elif x == "Strategy":
#        return 12
dfhigh.loc[dfhigh['Genre'] == 'Action', 'GenreId'] = 1 
dfhigh.loc[dfhigh['Genre'] == 'Sports', 'GenreId'] = 2
dfhigh.loc[dfhigh['Genre'] == 'Shooter', 'GenreId'] = 3 
dfhigh.loc[dfhigh['Genre'] == 'Role-Playing', 'GenreId'] = 4 
dfhigh.loc[dfhigh['Genre'] == 'Misc', 'GenreId'] = 5 
dfhigh.loc[dfhigh['Genre'] == 'Racing', 'GenreId'] = 6 
dfhigh.loc[dfhigh['Genre'] == 'Platform', 'GenreId'] = 7 
dfhigh.loc[dfhigh['Genre'] == 'Fighting', 'GenreId'] = 8 
dfhigh.loc[dfhigh['Genre'] == 'Simulation', 'GenreId'] = 9 
dfhigh.loc[dfhigh['Genre'] == 'Puzzle', 'GenreId'] = 10 
dfhigh.loc[dfhigh['Genre'] == 'Adventure', 'GenreId'] = 11 
dfhigh.loc[dfhigh['Genre'] == 'Strategy', 'GenreId'] = 12 


# df['Mean_global']=df['Global_Sales'].apply(lambda x: 'High' if x>=0.5377441 else 'Low')
# dfhigh['GenreId']=dfhigh['Genre'].apply(genretoid)
dfhigh.sample(10)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,Global_Sales,Mean_global,GenreId
852,854,Tennis,GB,1989.0,Sports,Nintendo,1.99,High,2.0
450,451,Dragon Ball Z: Budokai,PS2,2002.0,Fighting,Atari,3.09,High,8.0
1240,1242,Pac-Man World 2,PS2,2002.0,Action,Namco Bandai Games,1.51,High,1.0
530,531,WWF SmackDown! Just Bring It,PS2,2001.0,Fighting,THQ,2.79,High,8.0
3233,3235,MySims Racing,DS,2009.0,Racing,Electronic Arts,0.63,High,6.0
2921,2923,Prince of Persia: The Forgotten Sands,PS3,2010.0,Action,Ubisoft,0.7,High,1.0
3007,3009,Unreal Tournament III,PS3,2007.0,Shooter,Midway Games,0.67,High,3.0
2326,2328,Killzone: Liberation,PSP,2006.0,Shooter,Sony Computer Entertainment,0.89,High,3.0
3565,3567,Arena Football,PS2,2006.0,Sports,Electronic Arts,0.56,High,2.0
3360,3362,Mickey's Speedway USA,N64,2000.0,Racing,Nintendo,0.6,High,6.0


In [333]:
inputs = dfhigh.drop('Global_Sales',axis='columns')
target = dfhigh.Global_Sales

In [334]:
# inputs = pd.concat([inputs,GenreId],axis='columns')
# inputs.head(5)


In [335]:
inputs.drop(['Name','Platform','Genre','Year','Publisher','Mean_global'],axis='columns',inplace=True)
inputs.head()

Unnamed: 0,Rank,GenreId
0,1,2.0
1,2,7.0
2,3,6.0
3,4,2.0
4,5,4.0


In [336]:
X = inputs.astype(int)
Y = target.astype(int)
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.3)
model = GaussianNB()

In [337]:
y_test

2645     0
1472     1
472      2
2522     0
14      22
        ..
2636     0
1172     1
3163     0
1988     1
192      5
Name: Global_Sales, Length: 1111, dtype: int64

In [338]:
model.fit(X_train,y_train)

GaussianNB()

In [339]:
model.score(X_test,y_test)

0.9837983798379838

In [340]:
model.predict(X_test[0:10])

array([0, 1, 2, 0, 1, 1, 2, 0, 0, 5])

In [341]:
model.predict_proba(X_test[:10])

array([[9.95267188e-001, 4.73281181e-003, 2.56161892e-069,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000],
       [1.09659707e-002, 9.89034029e-001, 3.06502348e-012,
        5.29158712e-102, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.000

In [342]:
from sklearn.model_selection import cross_val_score
cross_val_score(GaussianNB(),X_train, y_train, cv=5)



array([0.97876448, 0.99227799, 0.99034749, 0.98262548, 0.98069498])