In [69]:
import pandas as pd # for data manipulation
import numpy as np # for data manipulation

from sklearn.model_selection import train_test_split # for splitting the data into train and test samples
from sklearn.metrics import classification_report # for model evaluation metrics
from sklearn.preprocessing import OrdinalEncoder # for encoding categorical features from strings to number arrays

# Differnt types of Naive Bayes Classifiers
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import BernoulliNB

In [70]:
df = pd.read_csv('/Users/navamongkoltongta/OneDrive - Assumption University/Year2.2/Data Science/Project/vgsales.csv')

In [71]:
df.drop(columns=['NA_Sales','EU_Sales','JP_Sales','Other_Sales'], inplace=True)

In [72]:
df.count()

Rank            16598
Name            16598
Platform        16598
Year            16327
Genre           16598
Publisher       16540
Global_Sales    16598
dtype: int64

In [73]:
df.dropna(how='any',subset=['Year','Publisher'],inplace=True)

In [74]:
df[df['Publisher'].isnull()]

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,Global_Sales


In [75]:
df.isnull().sum()

Rank            0
Name            0
Platform        0
Year            0
Genre           0
Publisher       0
Global_Sales    0
dtype: int64

In [76]:
df.count()

Rank            16291
Name            16291
Platform        16291
Year            16291
Genre           16291
Publisher       16291
Global_Sales    16291
dtype: int64

In [77]:
df.describe()

Unnamed: 0,Rank,Year,Global_Sales
count,16291.0,16291.0,16291.0
mean,8290.190228,2006.405561,0.54091
std,4792.65445,5.832412,1.567345
min,1.0,1980.0,0.01
25%,4132.5,2003.0,0.06
50%,8292.0,2007.0,0.17
75%,12439.5,2010.0,0.48
max,16600.0,2020.0,82.74


In [78]:
# White wins flag (1=win vs. 0=not-win) - dependent (target) variable
df['Mean_global']=df['Global_Sales'].apply(lambda x: 'High' if x>=0.5377441 else 'Low')

In [79]:
dfhigh = df.loc[df['Mean_global'] == 'High']
dfhigh.head(5)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,Global_Sales,Mean_global
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,82.74,High
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,40.24,High
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,35.82,High
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,33.0,High
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,31.37,High


In [80]:
dfhigh.Genre.value_counts()

Action          758
Sports          601
Shooter         412
Role-Playing    366
Misc            311
Racing          305
Platform        301
Fighting        219
Simulation      171
Puzzle           93
Adventure        86
Strategy         78
Name: Genre, dtype: int64

In [81]:
inputs = dfhigh.drop('Global_Sales',axis='columns')
target = dfhigh.Global_Sales

In [82]:
dummies = pd.get_dummies(inputs.Genre)
dummies.head(10)

Unnamed: 0,Action,Adventure,Fighting,Misc,Platform,Puzzle,Racing,Role-Playing,Shooter,Simulation,Sports,Strategy
0,0,0,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,0,0
6,0,0,0,0,1,0,0,0,0,0,0,0
7,0,0,0,1,0,0,0,0,0,0,0,0
8,0,0,0,0,1,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,1,0,0,0


In [83]:
inputs = pd.concat([inputs,dummies],axis='columns')
inputs.head(5)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,Mean_global,Action,Adventure,Fighting,Misc,Platform.1,Puzzle,Racing,Role-Playing,Shooter,Simulation,Sports,Strategy
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,High,0,0,0,0,0,0,0,0,0,0,1,0
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,High,0,0,0,0,1,0,0,0,0,0,0,0
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,High,0,0,0,0,0,0,1,0,0,0,0,0
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,High,0,0,0,0,0,0,0,0,0,0,1,0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,High,0,0,0,0,0,0,0,1,0,0,0,0


In [84]:
inputs.drop(['Name','Platform','Year','Genre','Publisher','Mean_global'],axis='columns',inplace=True)
inputs.head()

Unnamed: 0,Rank,Action,Adventure,Fighting,Misc,Puzzle,Racing,Role-Playing,Shooter,Simulation,Sports,Strategy
0,1,0,0,0,0,0,0,0,0,0,1,0
1,2,0,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,1,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,1,0
4,5,0,0,0,0,0,0,1,0,0,0,0


In [106]:
X = inputs.astype(int)
Y = target.astype(int)
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.3)
model = GaussianNB()

In [107]:
X_train

Unnamed: 0,Rank,Action,Adventure,Fighting,Misc,Puzzle,Racing,Role-Playing,Shooter,Simulation,Sports,Strategy
3165,3167,0,0,0,0,0,1,0,0,0,0,0
1960,1962,0,0,0,0,0,0,0,0,0,1,0
1472,1474,0,0,1,0,0,0,0,0,0,0,0
2230,2232,0,0,0,0,0,0,0,0,1,0,0
1130,1132,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3703,3705,0,0,0,0,0,1,0,0,0,0,0
494,495,0,0,0,0,0,0,0,1,0,0,0
3415,3417,0,0,0,0,0,0,0,0,0,1,0
2898,2900,0,0,0,0,0,0,0,0,0,1,0


In [108]:
model.fit(X_train,y_train)

GaussianNB()

In [118]:
model.score(X_test,y_test)

0.9081908190819082

In [114]:
model.predict(X_test[0:10])

array([1, 1, 1, 0, 1, 1, 0, 0, 0, 0])

In [115]:
model.predict_proba(X_test[:10])

array([[9.78947563e-006, 9.99890141e-001, 1.00069962e-004,
        4.11214529e-014, 9.45822530e-015, 3.67825900e-019,
        2.62114975e-018, 2.16877131e-045, 7.52205013e-103,
        4.83790809e-077, 6.62535211e-076, 9.54999327e-090,
        3.44555650e-260, 0.00000000e+000, 2.48089872e-052,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 3.18899035e-211, 0.00000000e+000,
        2.76478706e-185, 0.00000000e+000, 0.00000000e+000],
       [2.35361223e-001, 7.64638777e-001, 2.04655733e-032,
        1.43592678e-235, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+0

In [116]:
from sklearn.model_selection import cross_val_score
cross_val_score(GaussianNB(),X_train, y_train, cv=5)



array([0.91119691, 0.9015444 , 0.9034749 , 0.9015444 , 0.91698842])