In [325]:
#!pip install pandas
import pandas as pd
#!pip install numpy
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
from sklearn.preprocessing import LabelEncoder


In [326]:
import pickle

# Function to save a DataFrame
def save_dataframe(df, filename):
    """Save a DataFrame to a pickle file."""
    with open(filename, 'wb') as file:
        pickle.dump(df, file)
    print(f"DataFrame saved to {filename}")

# Function to load a DataFrame
def load_dataframe(filename):
    """Load a DataFrame from a pickle file."""
    with open(filename, 'rb') as file:
        df = pickle.load(file)
    print(f"DataFrame loaded from {filename}")
    return df

In [327]:
def missing_values(df):
    # Check and display the count of missing values per column
    missing_counts = df.isnull().sum()
    
    # Filter only columns that have missing values
    missing_counts = missing_counts[missing_counts > 0]
    
    # Display the missing values count
    print("Missing Values Count Per Column:")
    print(missing_counts)


def show_value_counts(df, col_name, top_n=10):
    if col_name not in df.columns:
        print(f"Column '{col_name}' not found in DataFrame.")
        return
    
    print(f"Value counts for column: {col_name}")
    print(df[col_name].value_counts().head(top_n))

def missing_stats(df):
    # Calculate missing values statistics
    print(pd.DataFrame({
        'Missing Count': df.isna().sum(),  # Count of NaN values per column
        'Total Rows': len(df),  # Total number of rows
        'Missing Percentage': (df.isna().sum() / len(df)) * 100  # Percentage of NaN per column
    }))


In [328]:

def convert_LabelEncoder(df, column_name):
    # Initialize the LabelEncoder
    le = LabelEncoder()
    # Fit and transform the column and update the DataFrame column
    df[column_name] = le.fit_transform(df[column_name])
    return df[column_name]



In [329]:
# Load CSV
df = pd.read_csv(r"C:\Users\oferg\Desktop\DS\ML-Project\dataframe.csv")


In [330]:
df.shape

(53949, 34)

In [331]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53949 entries, 0 to 53948
Data columns (total 34 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   League       53949 non-null  object 
 1   Season       53949 non-null  object 
 2   Stage        53949 non-null  object 
 3   Player       53949 non-null  object 
 4   Team         53938 non-null  object 
 5   GP           53949 non-null  int64  
 6   MIN          53949 non-null  float64
 7   FGM          53949 non-null  int64  
 8   FGA          53949 non-null  int64  
 9   3PM          53949 non-null  int64  
 10  3PA          53949 non-null  int64  
 11  FTM          53949 non-null  int64  
 12  FTA          53949 non-null  int64  
 13  TOV          53949 non-null  int64  
 14  PF           53949 non-null  int64  
 15  ORB          53949 non-null  int64  
 16  DRB          53949 non-null  int64  
 17  REB          53949 non-null  int64  
 18  AST          53949 non-null  int64  
 19  STL 

In [332]:
df.describe()

Unnamed: 0,GP,MIN,FGM,FGA,3PM,3PA,FTM,FTA,TOV,PF,ORB,DRB,REB,AST,STL,BLK,PTS,birth_year,height_cm,weight,weight_kg,draft_round,draft_pick
count,53949.0,53949.0,53949.0,53949.0,53949.0,53949.0,53949.0,53949.0,53949.0,53949.0,53949.0,53949.0,53949.0,53949.0,53949.0,53949.0,53949.0,53631.0,53875.0,49385.0,49385.0,10136.0,10136.0
mean,30.313574,752.431404,113.200541,245.094942,28.468535,80.738383,56.297299,76.012716,47.260487,70.10104,34.325048,90.601216,124.926264,62.78691,26.715398,10.492057,311.178372,1986.361675,197.445123,210.309527,95.422193,1.38753,14.053177
std,17.849616,534.216679,100.164033,212.155076,30.673395,80.672208,59.24065,76.172698,37.398461,45.62977,37.003235,84.6291,117.293566,73.184287,22.077459,18.199867,271.81159,6.637023,8.728587,26.128059,11.851299,0.508224,8.643064
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1961.0,160.0,130.0,59.0,1.0,1.0
25%,17.0,380.9,48.0,109.0,6.0,20.0,20.0,28.0,21.0,37.0,11.0,37.0,50.0,20.0,11.0,1.0,134.0,1982.0,191.0,190.0,86.0,1.0,6.0
50%,29.0,663.0,89.0,196.0,20.0,61.0,40.0,56.0,39.0,64.0,23.0,68.0,93.0,41.0,21.0,4.0,247.0,1987.0,198.0,209.0,95.0,1.0,13.0
75%,37.0,954.0,145.0,310.0,41.0,117.0,73.0,99.0,63.0,91.0,44.0,114.0,159.0,78.0,36.0,12.0,399.0,1991.0,203.0,229.0,104.0,2.0,22.0
max,85.0,3485.0,978.0,2173.0,402.0,1028.0,756.0,972.0,464.0,371.0,440.0,894.0,1247.0,925.0,225.0,307.0,2832.0,2004.0,229.0,375.0,170.0,7.0,30.0


# Data Protocol

In [334]:
# Type of values
df.dtypes.to_excel("df_datatype.xlsx", sheet_name='data_type')

# Maximum values (only numeric columns)
df.select_dtypes(include=['number']).max().to_excel("max_df.xlsx", sheet_name='max')

# Minimum values (only numeric columns)
df.select_dtypes(include=['number']).min().to_excel("min_df.xlsx", sheet_name='min')

# Missing values
df.isnull().sum(axis=0).to_excel("NA_df.xlsx", sheet_name='NA')

# Unique values count
df.nunique().to_excel("unique_df.xlsx", sheet_name='unique')

print("Excel files exported successfully.")


Excel files exported successfully.


# Missing Data

In [336]:
missing_stats(df)

             Missing Count  Total Rows  Missing Percentage
League               0         53949          0.000000    
Season               0         53949          0.000000    
Stage                0         53949          0.000000    
Player               0         53949          0.000000    
Team                11         53949          0.020390    
GP                   0         53949          0.000000    
MIN                  0         53949          0.000000    
FGM                  0         53949          0.000000    
FGA                  0         53949          0.000000    
3PM                  0         53949          0.000000    
3PA                  0         53949          0.000000    
FTM                  0         53949          0.000000    
FTA                  0         53949          0.000000    
TOV                  0         53949          0.000000    
PF                   0         53949          0.000000    
ORB                  0         53949          0.000000  

# Descriptive Statistics

In [338]:
# Install AutoViz if not installed
# !pip install autoviz

from autoviz.AutoViz_Class import AutoViz_Class
import pandas as pd
%matplotlib inline

# Initialize AutoViz
AV = AutoViz_Class()

# Load DataFrame before passing to AutoViz
#filename = r'C:\Users\oferg\Desktop\DS\ML-Project\dataframe.csv'
#sep = ','  # Adjust if needed (e.g., use '\t' for tab-separated values)

# Load CSV into a DataFrame
#df = pd.read_csv(filename, sep=sep)

# Apply AutoViz
#AV.AutoViz(filename="", dfte=df)


# Clean Text

In [340]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

# Ensure stopwords are downloaded
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Clean a string by lowercasing, removing punctuation/special characters, extra spaces, and stop words."""
    # Check if input is a string
    if not isinstance(text, str):
        return text

    original_text = text  # store the original for printing

    # Normalize text: lowercase
    text = text.lower()

    # Remove punctuation and special characters (keep letters, numbers, and whitespace)
    text = re.sub(r'[^a-z0-9\s]', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove stop words
    words = text.split()
    cleaned_words = [word for word in words if word not in stop_words]
    cleaned_text = ' '.join(cleaned_words)

    # Print before and after cleaning if there's a change
    #if original_text != cleaned_text:
    #    print("Original:", original_text, "=> Cleaned:", cleaned_text)
    
    return cleaned_text

def clean_text_column(df, column_name):
    """
    Cleans the text in the specified column of a DataFrame.
    
    For each row in the column, prints the original and cleaned text,
    then updates the DataFrame with the cleaned text.
    """
    df[column_name] = df[column_name].apply(clean_text)
    return df

string_columns = df.select_dtypes(include=['object','string']).columns.tolist()
#print(string_columns)
for column_name in string_columns: 
    # Clean the 'text' column
    df = clean_text_column(df,column_name )



# Transform/Manipulate data

In [342]:
# Convert 'birth_date' to datetime format
df['birth_date'] = pd.to_datetime(df['birth_date'], errors='coerce')

# Calculate the median date (ignoring NaN values)
median_birth_date = df['birth_date'].median()

# Fill NaN values in 'birth_date' with the median date
df['birth_date'] = df['birth_date'].fillna(median_birth_date)


# Extract the year from 'birth_date' and fill missing values in 'birth_year'
df['birth_year'] = df['birth_year'].fillna(df['birth_date'].dt.year)

# Mapping for month names to numbers
month_map = {
    'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
    'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12
}

# Reverse mapping for numbers back to month names
reverse_month_map = {v: k for k, v in month_map.items()}

# Step 1: Convert month names to numbers
df['birth_month'] = df['birth_month'].map(month_map).fillna(df['birth_month'])

# Step 2: Convert birth_month to numeric (handling any remaining strings)
df['birth_month'] = pd.to_numeric(df['birth_month'], errors='coerce')

# Step 3: Fill missing values using the month extracted from 'birth_date'
df['birth_month'] = df['birth_month'].fillna(df['birth_date'].dt.month)

df.drop('birth_date', axis=1, inplace=True)


In [343]:
df = df.drop(['height', 'weight'], axis=1)

def get_position(row):
    height = row['height_cm']
    weight = row['weight_kg']
    
    # Try to assign using both height and weight based on the table:
    # Note: Weight ranges in kg approximate the lbs ranges given.
    # Center: Height >= 208 cm and Weight >= 109 kg
    if height >= 208 and weight >= 109:
        return 'Center'
    # Power Forward (PF): Height 203-211 cm and Weight 100-118 kg
    elif 203 <= height <= 211 and 100 <= weight <= 118:
        return 'PF'
    # Small Forward (SF): Height 198-206 cm and Weight 95-104 kg
    elif 198 <= height <= 206 and 95 <= weight <= 104:
        return 'SF'
    # Shooting Guard (SG): Height 191-198 cm and Weight 86-95 kg
    elif 191 <= height <= 198 and 86 <= weight <= 95:
        return 'SG'
    # Point Guard (PG): Height 178-191 cm and Weight 75-86 kg
    elif 178 <= height <= 191 and 75 <= weight <= 86:
        return 'PG'
    else:
        # Fallback using height only:
        if height >= 208:
            return 'Center'
        elif height >= 203:
            return 'PF'
        elif height >= 198:
            return 'SF'
        elif height >= 191:
            return 'SG'
        elif height >= 178:
            return 'PG'
        else:
            return 'Unknown'

# Example usage on your DataFrame:
df['Position'] = df.apply(get_position, axis=1)

# Display the first few rows to verify the results
print(df[['height_cm', 'weight_kg', 'Position']].head())


   height_cm  weight_kg Position
0    216.0      147.0    Center 
1    198.0      100.0        SF 
2    206.0      120.0        PF 
3    183.0       75.0        PG 
4    193.0       82.0        SG 


In [344]:
df = df.astype({col: 'string' for col in ['Player','Team','draft_team','high_school']})

In [345]:

#Overall Position=(Round−1)×30+Pick in Round
mask = df['draft_round'].notna() & df['draft_pick'].notna()
df.loc[mask, 'draft_position'] = (df.loc[mask, 'draft_round'] - 1) * 30 + df.loc[mask, 'draft_pick']
df['draft_position'] = df['draft_position'].fillna(0)

#drop columns
df = df.drop(['REB', 'PTS','draft_pick','draft_round'], axis=1)
#REB=ORB+DRB
#PTS=FGM+3PM+FTM
#height=height_cm
#weight=weight_kg
df['NBA'] = (df['draft_position'] > 0).astype(int)

# Option using .loc with a case-insensitive comparison:
df.loc[df['League'].str.lower() != 'nba', 'League'] = 'Other'


In [346]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53949 entries, 0 to 53948
Data columns (total 30 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   League          53949 non-null  object 
 1   Season          53949 non-null  object 
 2   Stage           53949 non-null  object 
 3   Player          53949 non-null  string 
 4   Team            53938 non-null  string 
 5   GP              53949 non-null  int64  
 6   MIN             53949 non-null  float64
 7   FGM             53949 non-null  int64  
 8   FGA             53949 non-null  int64  
 9   3PM             53949 non-null  int64  
 10  3PA             53949 non-null  int64  
 11  FTM             53949 non-null  int64  
 12  FTA             53949 non-null  int64  
 13  TOV             53949 non-null  int64  
 14  PF              53949 non-null  int64  
 15  ORB             53949 non-null  int64  
 16  DRB             53949 non-null  int64  
 17  AST             53949 non-null 

# Data Cleansing

In [348]:
# Fill missing values in the 'Team' column with 'team_name' + index
df['Team'] = df['Team'].fillna(df.index.to_series().apply(lambda i: f'team_name{i}'))
df['draft_team'] = df['draft_team'].fillna('N/A')
df['high_school'] = df['high_school'].fillna('N/A')
df['nationality'] = df['high_school'].fillna('N/A')

In [349]:
from sklearn.impute import KNNImputer
imp_cols = ['height_cm','weight_kg']
knn_imputer = KNNImputer(n_neighbors=3)
imputed_data = pd.DataFrame(knn_imputer.fit_transform(df[imp_cols]), columns=imp_cols)
df[imp_cols] = imputed_data

df['BMI'] = df['weight_kg'] / ((df['height_cm'] / 100) ** 2)


In [350]:
#save_dataframe(df,'Feature Engineering')
df=load_dataframe('Feature Engineering')
df.info()

DataFrame saved to Feature Engineering
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53949 entries, 0 to 53948
Data columns (total 31 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   League          53949 non-null  object 
 1   Season          53949 non-null  object 
 2   Stage           53949 non-null  object 
 3   Player          53949 non-null  string 
 4   Team            53949 non-null  string 
 5   GP              53949 non-null  int64  
 6   MIN             53949 non-null  float64
 7   FGM             53949 non-null  int64  
 8   FGA             53949 non-null  int64  
 9   3PM             53949 non-null  int64  
 10  3PA             53949 non-null  int64  
 11  FTM             53949 non-null  int64  
 12  FTA             53949 non-null  int64  
 13  TOV             53949 non-null  int64  
 14  PF              53949 non-null  int64  
 15  ORB             53949 non-null  int64  
 16  DRB             53949 non-null  int64

# Feature Engineering & Feature Selection

In [352]:
from IPython.core.display import display, HTML

# Display scrollable output for Manu Ginobili after grouping
#display(HTML(df[df['Player'] == 'manu ginobili'].to_html(
#    notebook=True, escape=False, max_rows=500, max_cols=50, border=1
#)))

# Filter only Regular_Season and Playoffs
df_filtered = df[df['Stage'].isin(['regularseason', 'playoffs'])]

# Define the groupby columns (these should remain unchanged)
groupby_cols = [
    'League', 'Season','Player', 'Team', 'birth_year', 'birth_month',
    'height_cm', 'weight_kg', 'nationality', 'high_school',
    'draft_position', 'draft_team','Position','BMI','NBA'
]

# Perform groupby and sum the numeric columns
df_merged = df_filtered.groupby(groupby_cols, as_index=False).sum(numeric_only=True)

# Set the stage as "NBA Season" for merged data
df_merged['Stage'] = 'NBA Season'

# Remove original Regular_Season & Playoffs rows from df
df = df[~df['Stage'].isin(['regularseason', 'playoffs'])]

# Append the updated (merged) data to df
df = pd.concat([df, df_merged], ignore_index=True)

# Display scrollable output for Manu Ginobili after grouping
display(HTML(df[df['Player'] == 'manu ginobili'].to_html(
    notebook=True, escape=False, max_rows=500, max_cols=50, border=1
)))


Unnamed: 0,League,Season,Stage,Player,Team,GP,MIN,FGM,FGA,3PM,3PA,FTM,FTA,TOV,PF,ORB,DRB,AST,STL,BLK,birth_year,birth_month,height_cm,weight_kg,nationality,high_school,draft_team,Position,draft_position,NBA,BMI
1,Other,2000 2001,international,manu ginobili,bol,22,654.4,110,247,30,103,84,108,56,70,17,74,44,64,7,1977.0,7.0,198.0,93.0,,,san antonio spurs,SG,58.0,1,23.722069
169,Other,2001 2002,international,manu ginobili,bol,14,385.1,69,157,17,56,53,73,29,50,13,41,41,35,3,1977.0,7.0,198.0,93.0,,,san antonio spurs,SG,58.0,1,23.722069
47110,nba,2002 2003,NBA Season,manu ginobili,sas,93,2092.0,245,581,79,221,182,245,136,232,76,177,208,137,26,1977.0,7.0,198.0,93.0,,,san antonio spurs,SG,58.0,1,23.722069
47353,nba,2003 2004,NBA Season,manu ginobili,sas,87,2542.0,372,883,98,280,275,342,182,211,101,296,322,153,17,1977.0,7.0,198.0,93.0,,,san antonio spurs,SG,58.0,1,23.722069
47581,nba,2004 2005,NBA Season,manu ginobili,sas,97,2965.0,512,1066,139,354,502,627,238,259,94,368,385,147,33,1977.0,7.0,198.0,93.0,,,san antonio spurs,SG,58.0,1,23.722069
47815,nba,2005 2006,NBA Season,manu ginobili,sas,78,2238.3,385,826,97,259,353,447,154,199,51,238,274,120,33,1977.0,7.0,198.0,93.0,,,san antonio spurs,SG,58.0,1,23.722069
48038,nba,2006 2007,NBA Season,manu ginobili,sas,95,2662.6,495,1101,166,422,417,488,199,216,75,361,337,142,31,1977.0,7.0,198.0,93.0,,,san antonio spurs,SG,58.0,1,23.722069
48270,nba,2007 2008,NBA Season,manu ginobili,sas,91,2857.9,551,1216,194,491,449,519,247,220,77,341,399,119,38,1977.0,7.0,198.0,93.0,,,san antonio spurs,SG,58.0,1,23.722069
48726,nba,2009 2010,NBA Season,manu ginobili,sas,85,2502.2,456,1043,152,410,367,422,184,184,72,249,430,129,26,1977.0,7.0,198.0,93.0,,,san antonio spurs,SG,58.0,1,23.722069
48951,nba,2010 2011,NBA Season,manu ginobili,sas,85,2599.8,472,1088,163,469,389,451,191,176,44,271,414,136,31,1977.0,7.0,198.0,93.0,,,san antonio spurs,SG,58.0,1,23.722069


In [353]:
print(missing_stats(df))

                Missing Count  Total Rows  Missing Percentage
League                0           51440            0.0       
Season                0           51440            0.0       
Stage                 0           51440            0.0       
Player                0           51440            0.0       
Team                  0           51440            0.0       
GP                    0           51440            0.0       
MIN                   0           51440            0.0       
FGM                   0           51440            0.0       
FGA                   0           51440            0.0       
3PM                   0           51440            0.0       
3PA                   0           51440            0.0       
FTM                   0           51440            0.0       
FTA                   0           51440            0.0       
TOV                   0           51440            0.0       
PF                    0           51440            0.0       
ORB     

# One-Hot Encoding

In [355]:
print(df['NBA'].value_counts())

df['League'] = df['League'].astype('category')
df['League']=convert_LabelEncoder(df,'League')
print(df['League'].value_counts())


df['Season'] = df['Season'].astype('category')
df['Season']=convert_LabelEncoder(df,'Season')
print(df['Season'].value_counts())

df['Stage'] = df['Stage'].astype('category')
df['Stage']=convert_LabelEncoder(df,'Stage')
print(df['Stage'].value_counts())

df['nationality'] = df['nationality'].astype('category')
df['nationality']=convert_LabelEncoder(df,'nationality')
print(df['nationality'].value_counts())


df['Position'] = df['Position'].astype('category')
df['Position']=convert_LabelEncoder(df,'Position')
print(df['Position'].value_counts())


df['Team'] = df['Team'].astype('category')
df['Team']=convert_LabelEncoder(df,'Position')
print(df['Team'].value_counts())

        

NBA
0    43590
1     7850
Name: count, dtype: int64
League
0    46316
1     5124
Name: count, dtype: int64
Season
20    7145
19    5019
18    4679
17    4177
16    4146
13    3921
14    3878
15    3847
12    3700
8     1301
5     1255
10    1179
9     1175
11    1167
7     1113
6     1084
4     1055
3      579
2      415
1      377
0      228
Name: count, dtype: int64
Stage
1    46316
0     5124
Name: count, dtype: int64
nationality
4       29973
1938      209
1076      142
7         110
858       108
        ...  
2724        1
3026        1
2901        1
2455        1
2970        1
Name: count, Length: 3073, dtype: int64
Position
4    14245
2    11095
1    10679
3    10247
0     4680
5      494
Name: count, dtype: int64
Team
4    14245
2    11095
1    10679
3    10247
0     4680
5      494
Name: count, dtype: int64


In [356]:
missing_stats(df)

                Missing Count  Total Rows  Missing Percentage
League                0           51440            0.0       
Season                0           51440            0.0       
Stage                 0           51440            0.0       
Player                0           51440            0.0       
Team                  0           51440            0.0       
GP                    0           51440            0.0       
MIN                   0           51440            0.0       
FGM                   0           51440            0.0       
FGA                   0           51440            0.0       
3PM                   0           51440            0.0       
3PA                   0           51440            0.0       
FTM                   0           51440            0.0       
FTA                   0           51440            0.0       
TOV                   0           51440            0.0       
PF                    0           51440            0.0       
ORB     

In [357]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51440 entries, 0 to 51439
Data columns (total 31 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   League          51440 non-null  int32  
 1   Season          51440 non-null  int32  
 2   Stage           51440 non-null  int32  
 3   Player          51440 non-null  string 
 4   Team            51440 non-null  int64  
 5   GP              51440 non-null  int64  
 6   MIN             51440 non-null  float64
 7   FGM             51440 non-null  int64  
 8   FGA             51440 non-null  int64  
 9   3PM             51440 non-null  int64  
 10  3PA             51440 non-null  int64  
 11  FTM             51440 non-null  int64  
 12  FTA             51440 non-null  int64  
 13  TOV             51440 non-null  int64  
 14  PF              51440 non-null  int64  
 15  ORB             51440 non-null  int64  
 16  DRB             51440 non-null  int64  
 17  AST             51440 non-null 

In [358]:
#df.drop(['League','Stage','Team'], axis=1, inplace=True)
save_dataframe(df, 'dataframe_ready')
df_model = df.select_dtypes(include=['number'])


DataFrame saved to dataframe_ready


In [359]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_model, df.NBA, test_size=0.35, random_state=47)
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)
pred = model.predict(X_test)

In [360]:
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
# Calculate confusion matrix
cm = confusion_matrix(y_test, pred)
print("Confusion Matrix:")
print(cm)


Confusion Matrix:
[[15214    33]
 [   66  2691]]


In [361]:
accuracy = (cm[0, 0] + cm[1, 1]) / np.sum(cm)
accuracy

0.9945012219506776

In [362]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     15247
           1       0.99      0.98      0.98      2757

    accuracy                           0.99     18004
   macro avg       0.99      0.99      0.99     18004
weighted avg       0.99      0.99      0.99     18004

