In [None]:
import warnings
warnings.filterwarnings("ignore")

We are going to use the libraries below for **data import, processing and visualization**. As we progress, we will use other, more specific libraries for model building and evaluation. 

In [None]:
import pandas as pd 
import numpy as np
import seaborn as sn # visualization library based on matplotlib
import matplotlib.pylab as plt


## Data Import and Manipulation

### 1. Importing a data set

_Give the correct path to the data_



Modify the `ast_node_interactivity` kernel option to see the value of multiple statements at once, e.g. `InteractiveShell.ast_node_interactivity = "all"`.

In [None]:
# Remote CSV file that is loaded into the frame used throughout the notebook.
data_url = "https://raw.githubusercontent.com/manaranjanp/MICA_Classes/main/cases/fraud_data_new.csv"
earnings_df = pd.read_csv(data_url)

In [None]:
# Preview up to 10 random rows. The fixed random_state makes the preview the
# same on every run, and clamping n avoids an error on frames with fewer than
# 10 rows.
earnings_df.sample(n=min(10, len(earnings_df)), random_state=42)

Dropping `company_id`, as it will not be used for any analysis or model building.

In [None]:
# Remove the identifier column by rebinding the name rather than mutating the
# frame in place. errors='ignore' keeps this cell re-runnable: a second run,
# when 'company_id' is already gone, no longer raises KeyError.
earnings_df = earnings_df.drop(columns=['company_id'], errors='ignore')

earnings_df.head()


### 2. Structure of the dataset



In [None]:
# Print the frame summary. info() writes its report itself and returns None,
# so there is no cell result to suppress.
earnings_df.info()

In [None]:
# Number of rows for each distinct value of the status column.
earnings_df['status'].value_counts()

To get help on the features of an object

We will first start by printing the unique labels in categorical features

In [None]:
# Names of the columns that are selected as model inputs.
x_features = [
    'dsri',
    'gmi',
    'aqi',
    'sgi',
    'depi',
    'sgai',
    'accr',
    'levi',
]

In [None]:
# Smallest and largest dsri values, displayed as a (min, max) tuple.
dsri_values = earnings_df['dsri']
dsri_values.min(), dsri_values.max()

## EDA

In [None]:
# Histogram of dsri restricted to the rows whose status is 'No',
# using bin edges 0, 5, ..., 30.
status_no_df = earnings_df.loc[earnings_df['status'] == 'No']
sn.histplot(data=status_no_df, x='dsri', hue='status', bins=range(0, 35, 5));

In [None]:
# Histogram of dsri restricted to the rows whose status is 'Yes',
# using bin edges 0, 5, ..., 30.
is_status_yes = earnings_df['status'] == 'Yes'
sn.histplot(
    data=earnings_df[is_status_yes],
    x='dsri',
    hue='status',
    bins=range(0, 35, 5),
);

## Model Building: 

In [None]:
# Feature matrix: only the columns listed in x_features.
X = earnings_df.loc[:, x_features]
# Binary target: 1 where status equals 'Yes', 0 for every other value.
y = (earnings_df['status'] == 'Yes').astype('int64')

In [None]:
# Count how many rows were encoded as 1 and how many as 0 in the target.
y.value_counts()

### Train and test data split using Python

The train and test split can also be done using the **sklearn module**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. stratify=y keeps the proportion of
# 1s and 0s the same in the train and test partitions, which matters when one
# class is rare: an unstratified split can leave very few positives in the
# test set. random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

## Class Weights

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# Distinct labels present in the training target, computed once and reused
# both for the weight calculation and as the dictionary keys.
train_labels = np.unique(y_train)

# One 'balanced' weight per label, in the same order as train_labels.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=train_labels,
    y=y_train,
)

# Map each label to its weight and display the mapping.
class_weights_dict = {
    label: weight for label, weight in zip(train_labels, class_weights)
}
class_weights_dict

## Model Building: Using the **sklearn** 



In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# The "balanced" mode uses the values of y to automatically adjust weights
# inversely proportional to class frequencies in the input data as
# n_samples / (n_classes * np.bincount(y)).
# class_weights_dict holds weights computed in that mode from y_train.

tree_model = DecisionTreeClassifier(
    max_depth=5,
    criterion='gini',
    class_weight=class_weights_dict,
    # The tree permutes features at every split, so equally good splits can be
    # resolved differently between runs; a fixed seed makes the fitted tree
    # reproducible.
    random_state=42,
)

tree_model.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test features with the fitted tree.
y_pred = tree_model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Build the text report comparing true and predicted test labels, then print
# it so the line breaks in the report render as a table.
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

## Model Evaluation

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix on the test set; labels=[1, 0] indexes the matrix with
# class 1 first and class 0 second.
confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[1, 0])