## Data Manipulation and Cleaning

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import glob

In [2]:
path = os.getcwd()
csv_files = glob.glob(os.path.join(path,"Resources/*.csv"))

In [3]:
raw_data = {}

for file in csv_files:
    df_temp = pd.read_csv(file)
        
    file_name = file.split("\\")[-1]
    name = file_name.split(".")[0]
    
    # all files except the user_ratings one need to have their index set to BGGID to make merging simpler
    if name!="user_ratings":
        df_temp.set_index("BGGId",inplace=True)
    
    raw_data[name] = df_temp

In [5]:
merged_df = pd.DataFrame()
files = list(raw_data.keys())
to_add = files[:5]
to_add.remove("ratings_distribution")

for file in to_add:
    merged_df = pd.concat([merged_df,raw_data[file]],axis=1)
    
merged_df.head()

ValueError: list.remove(x): x not in list

In [5]:
merged_df.describe()

Unnamed: 0,YearPublished,GameWeight,AvgRating,BayesAvgRating,StdDev,MinPlayers,MaxPlayers,ComAgeRec,LanguageEase,BestPlayers,...,Theme_Fashion,Theme_Geocaching,Theme_Ecology,Theme_Chernobyl,Theme_Photography,Theme_French Foreign Legion,Theme_Cruise ships,Theme_Apache Tribes,Theme_Rivers,Theme_Flags identification
count,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,16395.0,16034.0,21925.0,...,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0,21925.0
mean,1985.494914,1.982131,6.424922,5.685673,1.516374,2.007343,5.707868,10.004391,216.461819,0.311517,...,9.1e-05,9.1e-05,9.1e-05,9.1e-05,9.1e-05,9.1e-05,9.1e-05,9.1e-05,9.1e-05,9.1e-05
std,212.486214,0.848983,0.932477,0.365311,0.285578,0.693093,15.014643,3.269157,236.595136,1.067002,...,0.009551,0.009551,0.009551,0.009551,0.009551,0.009551,0.009551,0.009551,0.009551,0.009551
min,-3500.0,0.0,1.04133,3.57481,0.196023,0.0,0.0,2.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2001.0,1.3333,5.83696,5.5103,1.32072,2.0,4.0,8.0,24.027778,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2011.0,1.9688,6.45395,5.54654,1.47688,2.0,4.0,10.0,138.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2017.0,2.5252,7.05245,5.67989,1.66547,2.0,6.0,12.0,351.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2021.0,5.0,9.91429,8.51488,4.27728,10.0,999.0,21.0,1757.0,15.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
raw_data["games"].dtypes

Name                    object
Description             object
YearPublished            int64
GameWeight             float64
AvgRating              float64
BayesAvgRating         float64
StdDev                 float64
MinPlayers               int64
MaxPlayers               int64
ComAgeRec              float64
LanguageEase           float64
BestPlayers              int64
GoodPlayers             object
NumOwned                 int64
NumWant                  int64
NumWish                  int64
NumWeightVotes           int64
MfgPlaytime              int64
ComMinPlaytime           int64
ComMaxPlaytime           int64
MfgAgeRec                int64
NumUserRatings           int64
NumComments              int64
NumAlternates            int64
NumExpansions            int64
NumImplementations       int64
IsReimplementation       int64
Family                  object
Kickstarted              int64
ImagePath               object
Rank:boardgame           int64
Rank:strategygames       int64
Rank:abs

In [7]:
colPop = ["MinPlayers","MaxPlayers","ComAgeRec","BestPlayers","NumOwned","NumWant","NumWish","NumWeightVotes","MfgPlaytime","ComMinPlaytime","ComMaxPlaytime","MfgAgeRec","NumUserRatings","NumComments","NumAlternates","NumExpansions","NumImplementations"]
games = raw_data["games"]
for colName in colPop:
    populated = games.loc[games[colName] > 0]
    print(f"Total populated in {colName}: {len(populated)}; as percentage: {len(populated)/len(games)}")

Total populated in MinPlayers: 21875; as percentage: 0.9977194982896237
Total populated in MaxPlayers: 21752; as percentage: 0.9921094640820981
Total populated in ComAgeRec: 16395; as percentage: 0.7477765108323832
Total populated in BestPlayers: 1981; as percentage: 0.09035347776510833
Total populated in NumOwned: 21924; as percentage: 0.9999543899657924
Total populated in NumWant: 19987; as percentage: 0.9116077537058153
Total populated in NumWish: 21819; as percentage: 0.9951653363740023
Total populated in NumWeightVotes: 21419; as percentage: 0.976921322690992
Total populated in MfgPlaytime: 21145; as percentage: 0.96442417331813
Total populated in ComMinPlaytime: 21273; as percentage: 0.9702622576966933
Total populated in ComMaxPlaytime: 21145; as percentage: 0.96442417331813
Total populated in MfgAgeRec: 20600; as percentage: 0.9395667046750285
Total populated in NumUserRatings: 21925; as percentage: 1.0
Total populated in NumComments: 0; as percentage: 0.0
Total populated in Num

Based on the percentage of populated columns in the cell above, we propose to remove the columns BestPlayers and NumComments based on lack of data. NumAlternates, NumExpansions, and NumImplementations also have low percentage of games populated, but they carry value as they indicate the number of alternate versions, expansions, and implementations the games have. If a game has an expansion pass, that might be an incentive for folks to try it out or purchase it (and maybe give it a different rating than they would have otherwise).

In [8]:
merged_df.drop(columns=["BestPlayers","NumComments"],inplace=True)

In [9]:
## confirming that BestPlayers and NumComments were removed
merged_df.columns[10:]

Index(['LanguageEase', 'GoodPlayers', 'NumOwned', 'NumWant', 'NumWish',
       'NumWeightVotes', 'MfgPlaytime', 'ComMinPlaytime', 'ComMaxPlaytime',
       'MfgAgeRec',
       ...
       'Theme_Fashion', 'Theme_Geocaching', 'Theme_Ecology', 'Theme_Chernobyl',
       'Theme_Photography', 'Theme_French Foreign Legion',
       'Theme_Cruise ships', 'Theme_Apache Tribes', 'Theme_Rivers',
       'Theme_Flags identification'],
      dtype='object', length=419)

In [10]:
print(merged_df[['GoodPlayers','Family','ImagePath']])

            GoodPlayers                         Family  \
BGGId                                                    
1            ['4', '5']    Classic Line (Valley Games)   
2                    []                            NaN   
3       ['2', '3', '4']  Euro Classics (Reiner Knizia)   
4                    []                            NaN   
5       ['3', '4', '5']                   3M Bookshelf   
...                 ...                            ...   
347146               []                            NaN   
347521               []                            NaN   
348955               []                            NaN   
349131               []                            NaN   
349161               []                            NaN   

                                                ImagePath  
BGGId                                                      
1       https://cf.geekdo-images.com/rpwCZAjYLD940NWwP...  
2       https://cf.geekdo-images.com/oQYhaJx5Lg3KcGis2...  
3    

Based on the cell above, the columns Family, ImagePath and GoodPlayers can be removed as they do not add value to the model.

In [11]:
goodPlayers = merged_df["GoodPlayers"]
goodPlayers.nunique()

78

In [12]:
merged_df.drop(columns=["Family","ImagePath","GoodPlayers"],inplace=True)

We don't want to include the Bayes Average Rating as a potential variable for predicting the average rating, so remove it. Similarly, StdDev will be removed

In [13]:
merged_df.drop(columns=["BayesAvgRating","StdDev"],inplace=True)

In [14]:
bad = raw_data["games"].loc[raw_data["games"]["AvgRating"] <6.5]
good = raw_data["games"].loc[raw_data["games"]["AvgRating"]>=6.5]
print(f"Number of games with rating less than 6.5: {len(bad)}\nNumber of games with rating greater than or equal to 6.5: {len(good)}")

Number of games with rating less than 6.5: 11375
Number of games with rating greater than or equal to 6.5: 10550


In [15]:
merged_df.head()

Unnamed: 0_level_0,Name,Description,YearPublished,GameWeight,AvgRating,MinPlayers,MaxPlayers,ComAgeRec,LanguageEase,NumOwned,...,Theme_Fashion,Theme_Geocaching,Theme_Ecology,Theme_Chernobyl,Theme_Photography,Theme_French Foreign Legion,Theme_Cruise ships,Theme_Apache Tribes,Theme_Rivers,Theme_Flags identification
BGGId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Die Macher,die macher game seven sequential political rac...,1986,4.3206,7.61428,3,5,14.366667,1.395833,7498,...,0,0,0,0,0,0,0,0,0,0
2,Dragonmaster,dragonmaster tricktaking card game base old ga...,1981,1.963,6.64537,3,4,,27.0,1285,...,0,0,0,0,0,0,0,0,0,0
3,Samurai,samurai set medieval japan player compete gain...,1998,2.4859,7.45601,2,4,9.307692,1.0,15578,...,0,0,0,0,0,0,0,0,0,0
4,Tal der Könige,triangular box luxurious large block tal der k...,1992,2.6667,6.60006,2,4,13.0,256.0,638,...,0,0,0,0,0,0,0,0,0,0
5,Acquire,acquire player strategically invest business t...,1964,2.5031,7.33861,2,6,11.410256,21.152941,23735,...,0,0,0,0,0,0,0,0,0,0


In [16]:
merged_df.columns[merged_df.isna().any()].tolist()

['Description', 'ComAgeRec', 'LanguageEase']

In [17]:
merged_df[["Description","ComAgeRec","LanguageEase"]].isna().sum()

Description        1
ComAgeRec       5530
LanguageEase    5891
dtype: int64

Description is just a blurb about the game- it will be removed later on, so no need to worry about the single NA row. I'll set it to 0 for now as it won't affect anything. ComAgeRec is the commuity voted recommended age. I'd like to use it in the model since it could provide some value. My suggestion is to fill in the NA values here with the manufacturer recommended age (available in MfgAgeRec column). LanguageEase defines how easy it is for people who do not speak the language in which the game was written to pick up a game. The higher the number, the harder it is for non-native speakers to understand/pick up the game. If the game does not have a score here, we can infer 0.

In [18]:
merged_df.loc[merged_df["Description"].isna()]

Unnamed: 0_level_0,Name,Description,YearPublished,GameWeight,AvgRating,MinPlayers,MaxPlayers,ComAgeRec,LanguageEase,NumOwned,...,Theme_Fashion,Theme_Geocaching,Theme_Ecology,Theme_Chernobyl,Theme_Photography,Theme_French Foreign Legion,Theme_Cruise ships,Theme_Apache Tribes,Theme_Rivers,Theme_Flags identification
BGGId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
170984,Timeline: Sports et Loisirs,,2014,1.0,6.64387,2,8,,657.0,170,...,0,0,0,0,0,0,0,0,0,0


In [19]:
merged_df["Description"].fillna(0, inplace=True)

In [20]:
merged_df.loc[[170984]]

Unnamed: 0_level_0,Name,Description,YearPublished,GameWeight,AvgRating,MinPlayers,MaxPlayers,ComAgeRec,LanguageEase,NumOwned,...,Theme_Fashion,Theme_Geocaching,Theme_Ecology,Theme_Chernobyl,Theme_Photography,Theme_French Foreign Legion,Theme_Cruise ships,Theme_Apache Tribes,Theme_Rivers,Theme_Flags identification
BGGId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
170984,Timeline: Sports et Loisirs,0,2014,1.0,6.64387,2,8,,657.0,170,...,0,0,0,0,0,0,0,0,0,0


In [21]:
merged_df["ComAgeRec"].fillna(merged_df["MfgAgeRec"], inplace=True)

In [22]:
merged_df["LanguageEase"].fillna(0, inplace=True)

## Model

In [23]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

#### Scaling

In [24]:
model_df = merged_df.copy(deep=True)
model_df["AvgRating"] = model_df["AvgRating"].apply(lambda x: 1 if x > 6.5 else 0)

In [25]:
## won't need name and description for creating the model
model_df.drop(columns=["Name","Description"], inplace=True)

In [26]:
model_df_scaled = StandardScaler().fit_transform(model_df[['YearPublished', 'GameWeight', 'MinPlayers', 'MaxPlayers',
       'ComAgeRec', 'LanguageEase','NumWeightVotes', 'MfgPlaytime', 'ComMinPlaytime',
       'ComMaxPlaytime', 'MfgAgeRec', 'NumUserRatings',
       'NumAlternates', 'NumExpansions',
       'Rank:boardgame', 'Rank:strategygames', 'Rank:abstracts',
       'Rank:familygames', 'Rank:thematic', 'Rank:cgs', 'Rank:wargames',
       'Rank:partygames', 'Rank:childrensgames', 'Cat:Thematic',
       'Cat:Strategy', 'Cat:War', 'Cat:Family', 'Cat:CGS', 'Cat:Abstract',
       'Cat:Party', 'Cat:Childrens','NumOwned','NumWant','NumWish','NumImplementations']])

In [27]:
model_df_transformed = pd.DataFrame(model_df_scaled, columns=['YearPublished', 'GameWeight', 'MinPlayers', 'MaxPlayers',
       'ComAgeRec', 'LanguageEase','NumWeightVotes', 'MfgPlaytime', 'ComMinPlaytime',
       'ComMaxPlaytime', 'MfgAgeRec', 'NumUserRatings',
       'NumAlternates', 'NumExpansions',
       'Rank:boardgame', 'Rank:strategygames', 'Rank:abstracts',
       'Rank:familygames', 'Rank:thematic', 'Rank:cgs', 'Rank:wargames',
       'Rank:partygames', 'Rank:childrensgames', 'Cat:Thematic',
       'Cat:Strategy', 'Cat:War', 'Cat:Family', 'Cat:CGS', 'Cat:Abstract',
       'Cat:Party', 'Cat:Childrens','NumOwned','NumWant','NumWish','NumImplementations'])
model_df_transformed.set_index(model_df.index, inplace=True)
model_df_transformed.head()

Unnamed: 0_level_0,YearPublished,GameWeight,MinPlayers,MaxPlayers,ComAgeRec,LanguageEase,NumWeightVotes,MfgPlaytime,ComMinPlaytime,ComMaxPlaytime,...,Cat:War,Cat:Family,Cat:CGS,Cat:Abstract,Cat:Party,Cat:Childrens,NumOwned,NumWant,NumWish,NumImplementations
BGGId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.002377,2.754499,1.432245,-0.047146,1.341048,-0.700713,3.458048,0.282239,0.397204,0.282239,...,-0.438064,-0.34367,-0.118379,-0.231474,-0.173402,-0.204609,1.139054,3.917263,2.296305,-0.364041
2,-0.021154,-0.022535,1.432245,-0.113749,0.634536,-0.586369,0.021967,-0.114253,-0.075869,-0.114253,...,-0.438064,-0.34367,-0.118379,-0.231474,-0.173402,-0.204609,-0.034539,0.258494,-0.047507,1.994239
3,0.058853,0.593393,-0.010595,-0.113749,-0.169188,-0.70248,6.811507,-0.057611,-0.075869,-0.057611,...,-0.438064,-0.34367,-0.118379,-0.231474,-0.173402,-0.204609,2.66531,6.458785,4.085872,0.815099
4,0.030615,0.806359,-0.010595,-0.113749,0.933062,0.436308,-0.094675,-0.057611,-0.008287,-0.057611,...,-0.438064,-0.34367,-0.118379,-0.231474,-0.173402,-0.204609,-0.156753,0.104979,-0.133751,-0.364041
5,-0.101161,0.613653,-0.010595,0.019457,0.458483,-0.612481,7.56482,-0.00097,0.059295,-0.00097,...,-0.438064,-0.34367,-0.118379,-0.231474,-0.173402,-0.204609,4.206111,4.318107,3.097869,-0.364041


In [28]:
model_df_remaining = model_df.drop(columns=['YearPublished', 'GameWeight', 'MinPlayers', 'MaxPlayers',
       'ComAgeRec', 'LanguageEase','NumWeightVotes', 'MfgPlaytime', 'ComMinPlaytime',
       'ComMaxPlaytime', 'MfgAgeRec', 'NumUserRatings',
       'NumAlternates', 'NumExpansions',
       'Rank:boardgame', 'Rank:strategygames', 'Rank:abstracts',
       'Rank:familygames', 'Rank:thematic', 'Rank:cgs', 'Rank:wargames',
       'Rank:partygames', 'Rank:childrensgames', 'Cat:Thematic',
       'Cat:Strategy', 'Cat:War', 'Cat:Family', 'Cat:CGS', 'Cat:Abstract',
       'Cat:Party', 'Cat:Childrens','NumOwned','NumWant','NumWish','NumImplementations'])
model_df_remaining.head()

Unnamed: 0_level_0,AvgRating,IsReimplementation,Kickstarted,Alliances,Area Majority / Influence,Auction/Bidding,Dice Rolling,Hand Management,Simultaneous Action Selection,Trick-taking,...,Theme_Fashion,Theme_Geocaching,Theme_Ecology,Theme_Chernobyl,Theme_Photography,Theme_French Foreign Legion,Theme_Cruise ships,Theme_Apache Tribes,Theme_Rivers,Theme_Flags identification
BGGId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0,0,1,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
model_df_final = model_df_transformed.join(model_df_remaining)
model_df_final.head()

Unnamed: 0_level_0,YearPublished,GameWeight,MinPlayers,MaxPlayers,ComAgeRec,LanguageEase,NumWeightVotes,MfgPlaytime,ComMinPlaytime,ComMaxPlaytime,...,Theme_Fashion,Theme_Geocaching,Theme_Ecology,Theme_Chernobyl,Theme_Photography,Theme_French Foreign Legion,Theme_Cruise ships,Theme_Apache Tribes,Theme_Rivers,Theme_Flags identification
BGGId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.002377,2.754499,1.432245,-0.047146,1.341048,-0.700713,3.458048,0.282239,0.397204,0.282239,...,0,0,0,0,0,0,0,0,0,0
2,-0.021154,-0.022535,1.432245,-0.113749,0.634536,-0.586369,0.021967,-0.114253,-0.075869,-0.114253,...,0,0,0,0,0,0,0,0,0,0
3,0.058853,0.593393,-0.010595,-0.113749,-0.169188,-0.70248,6.811507,-0.057611,-0.075869,-0.057611,...,0,0,0,0,0,0,0,0,0,0
4,0.030615,0.806359,-0.010595,-0.113749,0.933062,0.436308,-0.094675,-0.057611,-0.008287,-0.057611,...,0,0,0,0,0,0,0,0,0,0
5,-0.101161,0.613653,-0.010595,0.019457,0.458483,-0.612481,7.56482,-0.00097,0.059295,-0.00097,...,0,0,0,0,0,0,0,0,0,0


#### Creating the Model

In [30]:
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

In [31]:
y = model_df_final["AvgRating"]
X = model_df_final.drop(columns=["AvgRating"])

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [33]:
logistic_regression_model = LogisticRegression(random_state=1, solver="saga", max_iter=1000)

In [34]:
lr_model = logistic_regression_model.fit(X_train, y_train)



In [35]:
# Make a prediction using the testing data
training_predictions = lr_model.predict(X_train)
testing_predictions = logistic_regression_model.predict(X_test)

In [36]:
BAS = balanced_accuracy_score(y_test, testing_predictions)
print("Accuracy Score is {:.2f}".format(BAS))

Accuracy Score is 0.82


In [37]:
test_matrix = confusion_matrix(y_test, testing_predictions)
print(test_matrix)

[[2361  429]
 [ 555 2137]]


In [38]:
# Print the classification report for the model
testing_report = classification_report(y_test, testing_predictions)

# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

           0       0.81      0.85      0.83      2790
           1       0.83      0.79      0.81      2692

    accuracy                           0.82      5482
   macro avg       0.82      0.82      0.82      5482
weighted avg       0.82      0.82      0.82      5482

