In [220]:
import pandas as pd # for data manipulation
import numpy as np # for data manipulation

from sklearn.model_selection import train_test_split # for splitting the data into train and test samples
from sklearn.metrics import classification_report # for model evaluation metrics
from sklearn.preprocessing import OrdinalEncoder # for encoding categorical features from strings to number arrays

# Differnt types of Naive Bayes Classifiers
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import BernoulliNB

In [221]:
df = pd.read_csv('/Users/navamongkoltongta/OneDrive - Assumption University/Year2.2/Data Science/Project/vgsales.csv')

In [222]:
df.drop(columns=['NA_Sales','EU_Sales','JP_Sales','Other_Sales'], inplace=True)

In [223]:
df.count()

Rank            16598
Name            16598
Platform        16598
Year            16327
Genre           16598
Publisher       16540
Global_Sales    16598
dtype: int64

In [224]:
df.dropna(how='any',subset=['Year','Publisher'],inplace=True)

In [225]:
df[df['Publisher'].isnull()]

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,Global_Sales


In [226]:
df.isnull().sum()

Rank            0
Name            0
Platform        0
Year            0
Genre           0
Publisher       0
Global_Sales    0
dtype: int64

In [227]:
df.count()

Rank            16291
Name            16291
Platform        16291
Year            16291
Genre           16291
Publisher       16291
Global_Sales    16291
dtype: int64

In [228]:
df.describe()

Unnamed: 0,Rank,Year,Global_Sales
count,16291.0,16291.0,16291.0
mean,8290.190228,2006.405561,0.54091
std,4792.65445,5.832412,1.567345
min,1.0,1980.0,0.01
25%,4132.5,2003.0,0.06
50%,8292.0,2007.0,0.17
75%,12439.5,2010.0,0.48
max,16600.0,2020.0,82.74


In [229]:
# White wins flag (1=win vs. 0=not-win) - dependent (target) variable
df['Mean_global']=df['Global_Sales'].apply(lambda x: 'High' if x>=0.5377441 else 'Low')

In [230]:
dfhigh = df.loc[df['Mean_global'] == 'High']
dfhigh.head(5)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,Global_Sales,Mean_global
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,82.74,High
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,40.24,High
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,35.82,High
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,33.0,High
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,31.37,High


In [231]:
dfhigh.Genre.value_counts()

Action          758
Sports          601
Shooter         412
Role-Playing    366
Misc            311
Racing          305
Platform        301
Fighting        219
Simulation      171
Puzzle           93
Adventure        86
Strategy         78
Name: Genre, dtype: int64

In [232]:
# dummies = pd.get_dummies(inputs.Genre)
# dummies.head(10)
# def genretoid(x):
#     if x == "Action":
#        return 1
#     elif x == "Sports":
#        return 2
#     elif x == "Shooter":
#        return 3
#     elif x == "Role-Playing":
#        return 4
#     elif x == "Misc":
#        return 5
#     elif x == "Racing":
#        return 6
#     elif x == "Platform":
#        return 7
#     elif x == "Fighting":
#        return 8
#     elif x == "Simulation":
#        return 9
#     elif x == "Puzzle":
#        return 10
#     elif x == "Adventure":
#        return 11
#     elif x == "Strategy":
#        return 12
dfhigh.loc[df['Genre'] == 'Action', 'GenreId'] = 1 
dfhigh.loc[df['Genre'] == 'Sports', 'GenreId'] = 2
dfhigh.loc[df['Genre'] == 'Shooter', 'GenreId'] = 3 
dfhigh.loc[df['Genre'] == 'Role-Playing', 'GenreId'] = 4 
dfhigh.loc[df['Genre'] == 'Misc', 'GenreId'] = 5 
dfhigh.loc[df['Genre'] == 'Racing', 'GenreId'] = 6 
dfhigh.loc[df['Genre'] == 'Platform', 'GenreId'] = 7 
dfhigh.loc[df['Genre'] == 'Fighting', 'GenreId'] = 8 
dfhigh.loc[df['Genre'] == 'Simulation', 'GenreId'] = 9 
dfhigh.loc[df['Genre'] == 'Puzzle', 'GenreId'] = 10 
dfhigh.loc[df['Genre'] == 'Adventure', 'GenreId'] = 11 
dfhigh.loc[df['Genre'] == 'Strategy', 'GenreId'] = 12 


# df['Mean_global']=df['Global_Sales'].apply(lambda x: 'High' if x>=0.5377441 else 'Low')
# dfhigh['GenreId']=dfhigh['Genre'].apply(genretoid)
dfhigh.sample(10)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,Global_Sales,Mean_global,GenreId
3362,3364,Skylanders SWAP Force,WiiU,2013.0,Platform,Activision,0.6,High,7.0
1504,1506,Deal or No Deal,Wii,2009.0,Misc,Zoo Games,1.31,High,5.0
511,512,Command & Conquer: Red Alert,PC,1996.0,Strategy,Virgin Interactive,2.85,High,12.0
890,892,Mortal Kombat vs. DC Universe,X360,2008.0,Fighting,Midway Games,1.93,High,8.0
3306,3308,Yakuza: Dead Souls,PS3,2011.0,Shooter,Sega,0.61,High,3.0
2908,2910,The Sims 2: Apartment Pets,DS,2008.0,Simulation,Electronic Arts,0.7,High,9.0
3003,3005,Country Dance,Wii,2011.0,Misc,Funbox Media,0.68,High,5.0
2934,2936,TimeSplitters 2,PS2,2002.0,Shooter,Eidos Interactive,0.7,High,3.0
543,544,Saints Row: The Third,X360,2011.0,Action,THQ,2.77,High,1.0
2604,2606,Momotarou Dentetsu 7,PS,1997.0,Simulation,Hudson Entertainment,0.79,High,9.0


In [233]:
inputs = dfhigh.drop('Global_Sales',axis='columns')
target = dfhigh.Global_Sales

In [234]:
# inputs = pd.concat([inputs,GenreId],axis='columns')
# inputs.head(5)


In [235]:
inputs.drop(['Name','Platform','Genre','Year','Publisher','Mean_global'],axis='columns',inplace=True)
inputs.head()

Unnamed: 0,Rank,GenreId
0,1,2.0
1,2,7.0
2,3,6.0
3,4,2.0
4,5,4.0


In [239]:
X = inputs.astype(int)
Y = target.astype(int)
X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size=0.3)
model = GaussianNB()

In [240]:
X_train

Unnamed: 0,Rank,GenreId
1656,1658,1
3686,3688,8
1154,1156,1
3471,3473,4
1738,1740,10
...,...,...
2003,2005,1
3108,3110,5
1598,1600,4
2951,2953,1


In [241]:
model.fit(X_train,y_train)

GaussianNB()

In [242]:
model.score(X_test,y_test)

0.9756975697569757

In [243]:
model.predict(X_test[0:10])

array([1, 1, 1, 0, 1, 0, 1, 2, 1, 0])

In [115]:
model.predict_proba(X_test[:10])

array([[9.78947563e-006, 9.99890141e-001, 1.00069962e-004,
        4.11214529e-014, 9.45822530e-015, 3.67825900e-019,
        2.62114975e-018, 2.16877131e-045, 7.52205013e-103,
        4.83790809e-077, 6.62535211e-076, 9.54999327e-090,
        3.44555650e-260, 0.00000000e+000, 2.48089872e-052,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 3.18899035e-211, 0.00000000e+000,
        2.76478706e-185, 0.00000000e+000, 0.00000000e+000],
       [2.35361223e-001, 7.64638777e-001, 2.04655733e-032,
        1.43592678e-235, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
        0.00000000e+000, 0.00000000e+000, 0.00000000e+0

In [244]:
from sklearn.model_selection import cross_val_score
cross_val_score(GaussianNB(),X_train, y_train, cv=5)



array([0.98262548, 0.97683398, 0.97490347, 0.98069498, 0.98841699])