# In [1]:
#importing required packages

#modules for EDA steps
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#modules for data cleaning and data analysis
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
import scipy.stats as stats

#modules for model building
#algorithms for sampling
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

#baseline linear model
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

#modules for hyper parameter tuning
from sklearn.model_selection import GridSearchCV

#modules for model evaluation
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, accuracy_score, f1_score, r2_score
from sklearn.metrics import precision_recall_curve, roc_curve

#modules for avoiding warnings
import warnings
warnings.filterwarnings('ignore')

#setting backend for matplotlib
%matplotlib inline

#setting formatting options
pd.options.display.max_columns = 100
pd.options.display.max_rows = 900
#render floats in fixed-point notation; the full option key is spelled out
#so the call does not depend on pandas' partial option-name matching
pd.set_option('display.float_format', '{:f}'.format)

#setting plot style
#matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
#later releases dropped the old names, so use whichever name this
#installation offers (falling back to the default style if neither exists)
darkgrid_style = next(
    (style_name for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
     if style_name in plt.style.available),
    'default',
)
plt.style.use(darkgrid_style)
#reading the train and test files
fraud_train, fraud_test = (
    pd.read_csv(csv_path) for csv_path in ('fraudTrain.csv', 'fraudTest.csv')
)

#stacking both frames; reset_index() moves the old row labels into a
#leading 'index' column
df = pd.concat([fraud_train, fraud_test])
df = df.reset_index()

#removing the two leading columns: the 'index' column created above and the
#first column read from the files (presumably a row counter - verify against the CSVs)
leading_columns = df.columns[:2]
df.drop(columns=leading_columns, inplace=True)
df.head()
#parsing the transaction timestamp column into datetime values
timestamp_column = 'trans_date_trans_time'
df[timestamp_column] = pd.to_datetime(df[timestamp_column])
print(df[timestamp_column].dtype)
df.head()
#counting the distinct values held by every column
df.nunique()
# building extra time features out of 'trans_date_trans_time'
time_feature_builders = {
    #hour component of the timestamp
    'trans_hour': lambda stamps: stamps.dt.hour,
    #weekday name of the timestamp
    'trans_day_of_week': lambda stamps: stamps.dt.day_name(),
    #timestamp converted to a monthly period
    'trans_year_month': lambda stamps: stamps.dt.to_period('M'),
}
for feature_name, build_feature in time_feature_builders.items():
    df[feature_name] = build_feature(df['trans_date_trans_time'])

df.head()
#finding age
#converting 'dob' column to datetime
df['dob'] = pd.to_datetime(df['dob'])
#age at transaction time, rounded to whole years.
#pandas 2.x refuses the ambiguous np.timedelta64(1, 'Y') unit, so the elapsed
#time is divided by an explicit Timedelta of 365.2425 days, which is the
#year length numpy uses for its 'Y' unit
elapsed_since_birth = df['trans_date_trans_time'] - df['dob']
df['age'] = np.round(elapsed_since_birth / pd.Timedelta(days=365.2425))
#dropping the timestamp, the name columns and the date of birth
df.drop(['trans_date_trans_time','first', 'last', 'dob'] , axis=1, inplace=True)
df.head()
#printing the column dtypes and non-null counts
df.info()
#keeping an independent copy of the frame as it stands now
df_org = df.copy(deep=True)
#number of rows and columns
df.shape
#previewing the first rows
df.head()
#summary statistics
df.describe()
#share of each is_fraud value, expressed in percent
df['is_fraud'].value_counts(normalize=True) * 100
#summary of 'amt' overall, for non-fraud rows and for fraud rows, side by side
amt_percentiles = [0.5, 0.95, 0.999]

#overall summary keeps the statistic names as a 'Row Type' column
overall_amt_summary = (
    df['amt']
    .describe(percentiles=amt_percentiles)
    .reset_index()
    .rename(columns={'index': 'Row Type', 'amt': 'Overall Amt Distribution'})
)
#per-class summaries drop the statistic names so they line up positionally
non_fraud_amt_summary = (
    df.loc[df['is_fraud'] == 0, ['amt']]
    .describe(percentiles=amt_percentiles)
    .reset_index(drop=True)
    .rename(columns={'amt': 'Non-Fraud Amt Distribution'})
)
fraud_amt_summary = (
    df.loc[df['is_fraud'] == 1, ['amt']]
    .describe(percentiles=amt_percentiles)
    .reset_index(drop=True)
    .rename(columns={'amt': 'Fraud Amt Distribution'})
)
pd.concat([overall_amt_summary, non_fraud_amt_summary, fraud_amt_summary], axis=1)
#plotting the above distributions
#a bare figure is created here: plt.subplots() would additionally create a
#full-figure axes that the panels below are drawn over, and would bind
#`fig` to a (figure, axes) tuple instead of the figure
fig = plt.figure(figsize=(15,10))

plots = []
#plotting the amt feature
#box plot across the top row; x= is passed by keyword so the box is drawn
#horizontally however the installed seaborn treats positional input
plots.append(sns.boxplot(x=df.amt, ax=plt.subplot(211)))

#distribution plots along the bottom row, limited to amounts up to 1500:
#all transactions, non-fraud transactions, fraud transactions
capped_amt = df.loc[df.amt <= 1500, ['amt', 'is_fraud']]
hist_panels = [
    (234, capped_amt.amt, 'Overall amt Dist'),
    (235, capped_amt.loc[capped_amt.is_fraud == 0, 'amt'], 'Non Fraud amt Dist'),
    (236, capped_amt.loc[capped_amt.is_fraud == 1, 'amt'], 'Fraud amt Dist'),
]
for grid_position, amounts, panel_title in hist_panels:
    hist_axes = sns.histplot(amounts, bins=50, ax=plt.subplot(grid_position))
    hist_axes.set_title(panel_title)
    hist_axes.set_xlabel('Transaction Amount')
    plots.append(hist_axes)

#setting y label (only on the left-most histogram)
plots[1].set_ylabel('Number of transactions')

plt.show()


# 2x2 grid of axes, flattened so the panels can be addressed by position
fig, ax = plt.subplots(2, 2, figsize=(12, 10))
ax = ax.flatten()

# One count plot per derived time feature, with tilted tick labels
time_panels = [
    ('trans_hour', "Transaction Hour"),
    ('trans_day_of_week', "Transaction Day of Week"),
    ('trans_year_month', "Transaction Year-Month"),
]
for panel_axes, (feature, panel_title) in zip(ax, time_panels):
    sns.countplot(x=feature, data=df, ax=panel_axes)
    panel_axes.set_title(panel_title)
    panel_axes.set_xticklabels(panel_axes.get_xticklabels(), rotation=30)

# The fourth axes has nothing to show, so it is removed from the figure
fig.delaxes(ax[3])

# Render the figure
plt.tight_layout()
plt.show()

#distinct transactions and distinct card numbers per year_month
monthly_uniques = df.groupby('trans_year_month')[['trans_num', 'cc_num']].nunique()
df_timeline01 = monthly_uniques.reset_index().rename(
    columns={
        'trans_year_month': 'year_month',
        'trans_num': 'num_of_transactions',
        'cc_num': 'customers',
    }
)
df_timeline01
#one tick position per month
x = np.arange(len(df_timeline01))

fig, ax = plt.subplots(1, 1, figsize=(20, 5))
ax.plot(x, df_timeline01['num_of_transactions'])
ax.set_xticks(x)
ax.set_xticklabels(df_timeline01['year_month'])

ax.set(xlabel='Year Month', ylabel='Num of Transactions')
plt.show()
#rows flagged as fraud
df_fraud_transactions = df.loc[df['is_fraud'].eq(1)]

#distinct fraud transactions and distinct card numbers per year_month
monthly_fraud_uniques = (
    df_fraud_transactions.groupby('trans_year_month')[['trans_num', 'cc_num']].nunique()
)
df_timeline02 = monthly_fraud_uniques.reset_index().rename(
    columns={
        'trans_year_month': 'year_month',
        'trans_num': 'num_of_fraud_transactions',
        'cc_num': 'fraud_customers',
    }
)
df_timeline02
#one tick position per month
x = np.arange(len(df_timeline02))

fig, ax = plt.subplots(1, 1, figsize=(20, 5))
ax.plot(x, df_timeline02['fraud_customers'])
ax.set_xticks(x)
ax.set_xticklabels(df_timeline02['year_month'])

ax.set(xlabel='Year Month', ylabel='Number of Fraud customers')
plt.show()
# Figure with one wide panel on top and three panels underneath
fig = plt.figure(figsize=(15, 10))
ax1 = plt.subplot(2, 1, 1)  # top row, full width
ax2 = plt.subplot(2, 3, 4)  # bottom left
ax3 = plt.subplot(2, 3, 5)  # bottom middle
ax4 = plt.subplot(2, 3, 6)  # bottom right

# (axes, countplot arguments, title) for every panel: overall gender counts
# first, then gender-split counts by hour, weekday and year-month
gender_panels = [
    (ax1, {'x': 'gender'}, "Gender Demographic with Respect to Transactions"),
    (ax2, {'x': 'trans_hour', 'hue': 'gender'}, "Transactions by Hour with Gender"),
    (ax3, {'x': 'trans_day_of_week', 'hue': 'gender'}, "Transactions by Day of Week with Gender"),
    (ax4, {'x': 'trans_year_month', 'hue': 'gender'}, "Transactions by Year-Month with Gender"),
]
for ax, countplot_args, panel_title in gender_panels:
    sns.countplot(data=df, ax=ax, **countplot_args)
    ax.set_title(panel_title)
    # Vertical tick labels so long category names do not overlap
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)

# Tighten spacing and render
plt.tight_layout()
plt.show()
#transactions per gender
df_gender = (
    df.groupby('gender')['trans_num']
    .count()
    .reset_index()
    .rename(columns={'gender': 'Gender', 'trans_num': 'gender_count'})
)

#transactions per (gender, is_fraud) pair
df_fraud_gender = (
    df.groupby(['gender', 'is_fraud'])['trans_num']
    .count()
    .reset_index()
    .rename(columns={'gender': 'Gender', 'trans_num': 'Transaction Count'})
)

#attaching the per-gender total to every (gender, is_fraud) row
df_fraud_gender = pd.merge(df_fraud_gender, df_gender, how='inner', on='Gender')

#share of the gender's transactions that falls in each is_fraud class, in percent
gender_share = df_fraud_gender['Transaction Count'] / df_fraud_gender['gender_count']
df_fraud_gender['Transaction percentage'] = gender_share * 100

df_fraud_gender
sns.barplot(x='Gender', y='Transaction Count', hue='is_fraud', data=df_fraud_gender)

plt.show()
#let us first bin the age feature
#the binning is done on the whole column at once: the earlier per-row loop
#used chained assignment (df.age[i] = ...), which is very slow on a large
#frame and only works while the index labels are exactly 0..n-1
age_years = df['age']
#np.select takes the first matching condition, so each upper bound only needs
#to be tested once; anything matching none of them (including missing ages)
#gets the default label, exactly as the final `else` branch did
age_upper_bounds = [age_years <= 30, age_years <= 45, age_years <= 60, age_years <= 75]
age_labels = ['< 30', '30-45', '46-60', '61-75']
df['age'] = np.select(age_upper_bounds, age_labels, default='> 75')

df.age.head()
# Store the age buckets as a categorical column
df['age'] = pd.Categorical(df['age'])

# Count plot of the age buckets
plot = sns.countplot(data=df, x='age')

# Tilt the tick labels and title the axes
plot.set_xticklabels(plot.get_xticklabels(), rotation=45)
plot.set_title("Age Distribution")

# Render the figure
plt.tight_layout()
plt.show()
#transactions per age bucket
df_age = (
    df.groupby('age')['trans_num']
    .count()
    .reset_index()
    .rename(columns={'trans_num': 'age_count'})
)

#transactions per (age bucket, is_fraud) pair
df_fraud_age = (
    df.groupby(['age', 'is_fraud'])['trans_num']
    .count()
    .reset_index()
    .rename(columns={'trans_num': 'Transaction count'})
)

#attaching the per-bucket total to every (age, is_fraud) row
df_fraud_age = pd.merge(df_fraud_age, df_age, how='inner', on='age')

#share of the bucket's transactions that falls in each is_fraud class, in percent
age_share = df_fraud_age['Transaction count'] / df_fraud_age['age_count']
df_fraud_age['Transaction percentage'] = age_share * 100

df_fraud_age
sns.barplot(x='age', y='Transaction count', hue='is_fraud', data=df_fraud_age)

plt.show()
df.state.nunique()
# Per-state sums of every numeric column; the 'is_fraud' sum is the number of
# fraud transactions in the state.
# numeric_only is passed explicitly: the earlier `.sum('is_fraud')` handed the
# string to that flag positionally, so it never selected a column.
# groupby already orders the groups by state, so a single reset_index() gives
# the alphabetical, 0..n-1 indexed frame the old sort/reset chain ended up with.
train_state_fraud = df.groupby('state').sum(numeric_only=True).reset_index()

# Create a DataFrame for non-fraud transaction counts by state (indexed by state)
train_state_non_fraud_counts = df.groupby('state').agg(
    non_fraud_count=('is_fraud', lambda flags: (flags == 0).sum())
)

from plotly.subplots import make_subplots
import plotly.graph_objects as go

# One row holding two choropleth panels
choropleth_cell = {"type": "choropleth"}
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=['Fraud Transactions by State', 'Non-Fraud Transactions by State'],
    specs=[[dict(choropleth_cell), dict(choropleth_cell)]],
)

# Left panel: per-state 'is_fraud' totals, colour bar placed beside the left map
fraud_map = go.Choropleth(
    locations=train_state_fraud['state'],
    text=train_state_fraud['state'],
    z=train_state_fraud['is_fraud'].astype(float),
    locationmode='USA-states',
    colorscale='Reds',
    colorbar=dict(title="is_fraud", x=0.45),
)
fig.add_trace(fraud_map, row=1, col=1)

# Right panel: per-state non-fraud counts, colour bar placed beside the right map
non_fraud_states = train_state_non_fraud_counts.index
non_fraud_map = go.Choropleth(
    locations=non_fraud_states,
    text=non_fraud_states,
    z=train_state_non_fraud_counts['non_fraud_count'].astype(float),
    locationmode='USA-states',
    colorscale='Blues',
    colorbar=dict(title="non_fraud", x=1.0),
)
fig.add_trace(non_fraud_map, row=1, col=2)

# Restrict both maps to the USA
for map_column in (1, 2):
    fig.update_geos(scope='usa', row=1, col=map_column)

# Centered figure title
fig.update_layout(
    title=dict(
        text="Fraud vs Non-Fraud Transactions by State",
        x=0.5,
        xanchor='center',
    )
)

# Render the figure
fig.show()

#the 20 states with the most transactions
state_frequencies = df['state'].value_counts()
high_trans_states = state_frequencies.index[:20].tolist()
print(high_trans_states)
#percentage split of transactions among those 20 states
in_top_states = df['state'].isin(high_trans_states)
df.loc[in_top_states, 'state'].value_counts(normalize=True) * 100
fig, ax = plt.subplots(figsize=(10, 8))

# Count plot of every state, with vertical tick labels
plot = sns.countplot(data=df, x='state', ax=ax)
plot.set_xticklabels(plot.get_xticklabels(), rotation=90)

plt.show()

#transactions per state
df_state = (
    df.groupby('state')['trans_num']
    .count()
    .reset_index()
    .rename(columns={'trans_num': 'state_count'})
)

#transactions per (state, is_fraud) pair
df_fraud_state = (
    df.groupby(['state', 'is_fraud'])['trans_num']
    .count()
    .reset_index()
    .rename(columns={'trans_num': 'Transaction count'})
)

#attaching the per-state total to every (state, is_fraud) row
df_fraud_state = pd.merge(df_fraud_state, df_state, how='inner', on='state')

#share of the state's transactions that falls in each is_fraud class, in percent
state_share = df_fraud_state['Transaction count'] / df_fraud_state['state_count']
df_fraud_state['Transaction percentage'] = state_share * 100

#the 20 states with the highest fraud percentage
state_fraud_rows = df_fraud_state['is_fraud'] == 1
df_fraud_state[state_fraud_rows].sort_values(by='Transaction percentage', ascending=False).head(20)
#states where at least 75% of the transactions are fraudulent
print('state with more than 75% fraudulent transactions:\n')
mostly_fraud_rows = state_fraud_rows & (df_fraud_state['Transaction percentage'] >= 75)
print(df_fraud_state.loc[mostly_fraud_rows, 'state'])
#distinct city and zip code counts
for count_label, location_column in (('number of cities: ', 'city'),
                                     ('number of zip codes: ', 'zip')):
    print(count_label, df[location_column].nunique())
#the 20 most frequent cities and zip codes
high_trans_cities = df['city'].value_counts().index[:20].tolist()
high_trans_zips = df['zip'].value_counts().index[:20].tolist()
print('high frequencies cities: ', high_trans_cities)
print('high frequencies zip codes: ', high_trans_zips)
df['category'].value_counts(normalize=True)
# Two count plots of 'category': plain counts first, then split by is_fraud
for extra_countplot_args in ({}, {'hue': 'is_fraud'}):
    plot = sns.countplot(data=df, x='category', **extra_countplot_args)
    # Vertical tick labels keep the category names readable
    plot.set_xticklabels(plot.get_xticklabels(), rotation=90)

    plt.show()

#transactions per category
df_category = (
    df.groupby('category')['trans_num']
    .count()
    .reset_index()
    .rename(columns={'trans_num': 'category_count'})
)

#transactions per (category, is_fraud) pair
df_fraud_category = (
    df.groupby(['category', 'is_fraud'])['trans_num']
    .count()
    .reset_index()
    .rename(columns={'trans_num': 'Transaction count'})
)

#attaching the per-category total to every (category, is_fraud) row
df_fraud_category = pd.merge(df_fraud_category, df_category, how='inner', on='category')

#share of the category's transactions that falls in each is_fraud class, in percent
category_share = df_fraud_category['Transaction count'] / df_fraud_category['category_count']
df_fraud_category['Transaction percentage'] = category_share * 100

#categories ordered by fraud percentage, highest first
category_fraud_rows = df_fraud_category['is_fraud'] == 1
df_fraud_category[category_fraud_rows].sort_values(by='Transaction percentage', ascending=False)
#categories where at least one percent of the transactions are fraudulent
at_least_one_percent = df_fraud_category['Transaction percentage'] >= 1
df_fraud_category.loc[category_fraud_rows & at_least_one_percent, 'category']
df_fraud_category.head()
df['merchant'].nunique()
#the 20 merchants with the most transactions
high_trans_merchants = df['merchant'].value_counts().index[:20].tolist()
high_trans_merchants
# Count plot restricted to those merchants, with vertical tick labels
top_merchant_rows = df.loc[df['merchant'].isin(high_trans_merchants)]
plot = sns.countplot(data=top_merchant_rows, x='merchant')
plot.set_xticklabels(plot.get_xticklabels(), rotation=90)

plt.show()

#transactions per merchant
df_merchant = (
    df.groupby('merchant')['trans_num']
    .count()
    .reset_index()
    .rename(columns={'trans_num': 'merchant_count'})
)

#transactions per (merchant, is_fraud) pair
df_fraud_merchant = (
    df.groupby(['merchant', 'is_fraud'])['trans_num']
    .count()
    .reset_index()
    .rename(columns={'trans_num': 'Transaction count'})
)

#attaching the per-merchant total to every (merchant, is_fraud) row
df_fraud_merchant = pd.merge(df_fraud_merchant, df_merchant, how='inner', on='merchant')

#share of the merchant's transactions that falls in each is_fraud class, in percent
merchant_share = df_fraud_merchant['Transaction count'] / df_fraud_merchant['merchant_count']
df_fraud_merchant['Transaction percentage'] = merchant_share * 100

#the 20 merchants with the highest fraud percentage
merchant_fraud_rows = df_fraud_merchant['is_fraud'] == 1
df_fraud_merchant[merchant_fraud_rows].sort_values(by='Transaction percentage', ascending=False).head(20)

#merchants where at least two percent of the transactions are fraudulent
at_least_two_percent = df_fraud_merchant['Transaction percentage'] >= 2
df_fraud_merchant.loc[merchant_fraud_rows & at_least_two_percent, 'merchant']
#every encoding below drops its first level to avoid a redundant column
dummy_options = {'drop_first': True}
#indicator columns for the transaction category
category_onehot = pd.get_dummies(df['category'], prefix='category', **dummy_options)
#indicator columns for gender
gender_onehot = pd.get_dummies(df['gender'], prefix='gender', **dummy_options)
#indicator columns for the weekday
day_of_week_onehot = pd.get_dummies(df['trans_day_of_week'], prefix='day', **dummy_options)
#indicator columns for the age bucket
age_onehot = pd.get_dummies(df['age'], prefix='age', **dummy_options)
#original columns followed by all indicator columns
encoded_blocks = [category_onehot, gender_onehot, day_of_week_onehot, age_onehot]
df1 = pd.concat([df, *encoded_blocks], axis=1)

df1.head()
#removing the card number and transaction id columns
identifier_columns = ['cc_num', 'trans_num']
df1.drop(columns=identifier_columns, inplace=True)
print(df1.shape)
df1.columns
#removing the remaining text columns and the columns already one-hot encoded
text_and_encoded_columns = [
    'merchant', 'street', 'city', 'state', 'job',
    'category', 'gender', 'trans_day_of_week',
    'age',
]
df1.drop(columns=text_and_encoded_columns, inplace=True)
df1.columns
df1.info()
#removing the monthly period column
df1 = df1.drop(['trans_year_month'], axis=1)
#pairwise correlations between the columns of df1
df_random_under_corr = df1.corr()
#drawing the correlation matrix as a heatmap on a 15x15 figure
heatmap_axes = plt.figure(figsize=(15,15)).add_subplot()
sns.heatmap(df_random_under_corr, ax=heatmap_axes)
plt.show()
#function to return highly correlated column above a threshold
def correlation(dataset, threshold):
    """Return the columns that are strongly correlated with an earlier column.

    Each column is compared with the columns that precede it in
    ``dataset.corr()``; when the absolute correlation with any of them
    exceeds ``threshold``, the later column of the pair is reported.
    Absolute values are used so that strong negative correlations are
    detected as well as strong positive ones.

    Args:
        dataset: DataFrame whose pairwise column correlations are examined.
        threshold: Correlation magnitude a pair must exceed to be flagged.

    Returns:
        A set with the names of the flagged columns (empty if none qualify).
    """
    corr_matrix = dataset.corr().abs()  # magnitude of every pairwise correlation
    column_names = corr_matrix.columns
    col_corr = set()
    # Row i, columns 0..i-1 is the part of the lower triangle that pairs
    # column i with every column that comes before it.
    for i in range(1, len(column_names)):
        if (corr_matrix.iloc[i, :i] > threshold).any():
            col_corr.add(column_names[i])
    return col_corr
#columns flagged by correlation() at a 0.85 threshold
corr_features = correlation(df1, threshold=0.85)
corr_features
# Full correlation matrix of df1
correlation_matrix = df1.corr()

# Print it
print(correlation_matrix)

import numpy as np

# Absolute correlations, so the sign does not matter below
corr_matrix = correlation_matrix.abs()

# Keep only the entries strictly above the diagonal; everything else becomes NaN
above_diagonal = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper_triangle = corr_matrix.where(above_diagonal)

# Columns having at least one above-diagonal entry greater than 0.8
exceeds_limit = upper_triangle.gt(0.8).any(axis=0)
high_corr = exceeds_limit[exceeds_limit].index.tolist()
print("Highly correlated features:", high_corr)

# df2 is a second name for the same DataFrame object as df1 (no copy is made)
df2 = df1
df2.info()
df2.head()
# info() prints its report itself and returns None, so it is not wrapped in print()
df1.info()

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Define X (features) and y (target)
X = df1.drop(columns=['is_fraud'])
y = df1['is_fraud']

# stratify=y keeps the is_fraud class proportions the same in the train and
# test parts of the split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Initialize and train the model; the seed makes the fitted forest, and so the
# importances used below to drop columns, reproducible between runs
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Importance of every feature, listed in the column order of X
importances = pd.DataFrame({'Feature': X.columns, 'Importance': model.feature_importances_})
print(importances)



# model = RandomForestClassifier()
# model.fit(X, y)

# # Rank features by importance
# importances = pd.DataFrame({'Feature': X.columns, 'Importance': model.feature_importances_})
# importances = importances.sort_values(by='Importance', ascending=False)
# print(importances)

# # Drop features with low importance
# low_importance_features = importances[importances['Importance'] < 0.01]['Feature']
# df1 = df1.drop(columns=low_importance_features)

# Remove every feature whose importance is below 0.01
is_low_importance = importances['Importance'].lt(0.01)
low_importance_features = importances.loc[is_low_importance, 'Feature']
df1 = df1.drop(low_importance_features, axis=1)
df1.info()