# Convolve Epoch 2
## Round 2
---


# Importing Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import dask.dataframe as dd
import seaborn as sns
import plotly.express as px
import plotly.io as pio
import os

In [2]:

# Locations of the two Excel workbooks: the development (training) set
# and the validation set used for final testing.

train_data_filename = (
    "/mnt/c/Users/parth/Desktop/CODING/Convolve_Round2/dataset/Dev_data_to_be_shared.xlsx"
)
test_data_filename = (
    "/mnt/c/Users/parth/Desktop/CODING/Convolve_Round2/dataset/validation_data_to_be_shared.xlsx"
)


Reading the `Excel` files

In [3]:
# Load the training workbook; row 0 of the sheet supplies the column names.
train_df = pd.read_excel(
    train_data_filename,
    header=0,
)

In [4]:
# Write the training frame to CSV with a header row and without the row index.
# `index` is documented as a boolean; the earlier `index=None` suppressed the
# index only because None is falsy, so spell the intent explicitly.
train_df.to_csv(
    "train_df.csv",
    index=False,
    header=True,
)

In [5]:
# Load the validation workbook; row 0 of the sheet supplies the column names.
test_df = pd.read_excel(
    test_data_filename,
    header=0,
)

In [6]:
# Write the validation frame to CSV with a header row and without the row index.
# `index` is documented as a boolean; the earlier `index=None` suppressed the
# index only because None is falsy, so spell the intent explicitly.
test_df.to_csv(
    "test_df.csv",
    index=False,
    header=True,
)

Reading the summary statistics with `.describe()`  

In [7]:
# Summary statistics (count, mean, min, quartiles, max, std) for the columns that describe() reports.
train_df.describe()


Unnamed: 0,Primary key,Target,account_opening_date,demog_1,demog_3,demog_5,demog_6,demog_7,demog_8,demog_9,...,others_41,txn_80,txn_81,demog_39,demog_41,others_42,others_43,others_44,others_45,demog_42
count,100000.0,100000.0,100000,96331.0,99994.0,99975.0,99999.0,99994.0,96249.0,87086.0,...,99999.0,42522.0,48535.0,100000.0,100000.0,2105.0,4579.0,874.0,694.0,100000.0
mean,50000.5,0.02,2023-01-25 16:25:19.200000512,79.543376,101.323279,3.290163,1.01718,1.0,41.191701,0.326103,...,0.00222,34.742439,22.679159,0.99994,0.96329,87.887886,62.024896,28.808924,96.700288,0.95847
min,1.0,0.0,2022-12-01 00:00:00,51.0,101.0,1.0,1.0,1.0,41.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,25000.75,0.0,2022-12-26 00:00:00,55.0,101.0,3.0,1.0,1.0,41.0,0.0,...,0.0,2.0,6.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
50%,50000.5,0.0,2023-01-25 00:00:00,55.0,101.0,4.0,1.0,1.0,41.0,0.0,...,0.0,5.0,12.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
75%,75000.25,0.0,2023-02-24 00:00:00,55.0,101.0,4.0,1.0,1.0,41.0,1.0,...,0.0,15.0,27.0,1.0,1.0,2.0,4.0,0.0,0.0,1.0
max,100000.0,1.0,2023-03-31 00:00:00,421.0,213.0,4.0,4.0,1.0,52.0,1.0,...,1.0,13907.0,7810.0,1.0,1.0,4049.0,3283.0,2922.0,3603.0,1.0
std,28867.657797,0.140001,,89.498781,5.350011,0.833449,0.182553,0.0,0.641951,0.468788,...,0.047065,251.845838,49.903348,0.007746,0.18805,298.951251,200.384998,136.963658,370.396893,0.199514


In [8]:
# Data type of every column in the training frame.
train_df.dtypes


Primary key                      int64
Target                           int64
account_opening_date    datetime64[ns]
country_code                    object
demog_1                        float64
                             ...      
others_43                      float64
others_44                      float64
others_45                      float64
demog_42                         int64
demog_43                        object
Length: 178, dtype: object

In [9]:
# Number of distinct values in each column.
train_df.nunique()


Primary key             100000
Target                       2
account_opening_date       121
country_code                49
demog_1                     16
                         ...  
others_43                  479
others_44                   99
others_45                  135
demog_42                     2
demog_43                     3
Length: 178, dtype: int64

In [10]:
# Count of missing (null) entries per column.
train_df.isnull().sum()

Primary key                 0
Target                      0
account_opening_date        0
country_code             3666
demog_1                  3669
                        ...  
others_43               95421
others_44               99126
others_45               99306
demog_42                    0
demog_43                 3705
Length: 178, dtype: int64

# Seeing the `columns` of the `train_df`

In [11]:
# Column labels of the training frame.
train_df.columns

Index(['Primary key', 'Target', 'account_opening_date', 'country_code',
       'demog_1', 'demog_2', 'income', 'demog_3', 'city_tier', 'occupation',
       ...
       'demog_39', 'email_domain', 'demog_40', 'demog_41', 'others_42',
       'others_43', 'others_44', 'others_45', 'demog_42', 'demog_43'],
      dtype='object', length=178)

# Plotting the `cols` against `Target`

In [17]:

# Everything except the 'Primary key' column is plotted, 40 columns per batch
columns_to_plot = [name for name in train_df.columns if name != 'Primary key']
batch_size = 40

# Folder that receives the rendered images (change the path as needed)
image_directory = '/mnt/c/Users/parth/Desktop/CODING/Convolve_Round2/image_directory/'
os.makedirs(image_directory, exist_ok=True)

# First batch: one histogram per attribute, coloured by 'Target', written to disk.
# Slicing stops at the end of the list when there are fewer than batch_size columns.
for feature in columns_to_plot[:batch_size]:
    fig = px.histogram(train_df, x=feature, color='Target', title=f'{feature} vs Target')
    pio.write_image(fig, f'{image_directory}{feature}_vs_Target.png')
    



In [18]:
# Second batch: histogram of each attribute coloured by 'Target', written to disk.
# The slice ends at the end of the list, so no explicit bounds check is needed.
for feature in columns_to_plot[batch_size:2 * batch_size]:
    fig = px.histogram(train_df, x=feature, color='Target', title=f'{feature} vs Target')
    pio.write_image(fig, f'{image_directory}{feature}_vs_Target.png')




In [19]:
# Third batch: histogram of each attribute coloured by 'Target', written to disk.
# The slice ends at the end of the list, so no explicit bounds check is needed.
for feature in columns_to_plot[2 * batch_size:3 * batch_size]:
    fig = px.histogram(train_df, x=feature, color='Target', title=f'{feature} vs Target')
    pio.write_image(fig, f'{image_directory}{feature}_vs_Target.png')



In [20]:
# Fourth batch: every remaining attribute, plotted against 'Target' and written to disk.
for feature in columns_to_plot[3 * batch_size:]:
    fig = px.histogram(train_df, x=feature, color='Target', title=f'{feature} vs Target')
    pio.write_image(fig, f'{image_directory}{feature}_vs_Target.png')



### The images are saved to a directory on disk because they are heavy files and would slow down the notebook.
---


# What we infer from the graphs 

- `txn_61`
  - 1871 points have target 1
  - Value 0 -> Always
- `txn_62`
  - Value 0 -> 1866 have target 1
  - Value 1 -> 696 have target 0, very few have target 1
- `txn_63`
  - Value 0 -> 1867 have target 1
  - At other data points there are very few values of either target
- `txn_64`
  - Value 0 -> Many mules , target value 1
  - Mules happen at single digit values, but it takes values till 200.
- `txn_65`
  - Value -> Always 0
  - 1871 have target 1
- `txn_66`
  - 1800s have target 1
  - Value 0 -> Always
- `txn_67`
  - Target 1 at Values 0 like before
- `txn_68`
  - Wide range of values from 0 - 17000
  - Value 0 -> 1800s mules
  - Few mules elsewhere, at values up to a maximum of about 700.
- `txn_69`
  - Value 0 -> 1200s
  - Value 1 -> 500s
  - Value 2 -> 150s
  - Value 3 -> 70s
  - Mules till 15.
- `txn_70`
  - Value only 0, 1800s target 1
- `txn_73`
  - Different & Visible pattern
  - Decreasing graph
- `txn_74`
  - Different pattern
  - Like txn_73
- `txn_75`
  - Different pattern like txn_74
- `txn_76`
  - Different and like txn_75
- `txn_77`
  - Slightly higher peak at zero, like txn_76
  - Different pattern, slight right skewness
- `txn_78`
  - Like `txn_77`
- `txn_79`
  - Like txn_78

`txn_73` :- `txn_80` ,`txn_81`(Peak spikes), 
  

- `others_1`
  - Value 0 -> 1700s Mules
  - Value 1 -> 200s Mules
- `others_3`
  - Proportion of mules at Value 1 more
- `others_6`
  - Different pattern
  - Number of Mules at lower values are higher in comparison to later values.
- `others_7`
  - Same as `others_6`, looks important.
- `others_8`
  - Mules at lower values only.
- `others_9`
  - Looks important, like others_6.
- `others_10`
  - Looks like others_6
- `others_12`
  - Looks like others_6
- `others_11`
  - Like others_8

`others_6` :- `others_13` ,`others_15` , `others_16`  
`others_8` :- `others_14` , `others_17` , `others_18` , `others_19`, `others_20` ,`others_21` ,`others_22` ,`others_24`, `others_25` ,`others_23` , `others_26` ,`others_29`,`others_30`,`others_31`,`others_32`,`others_37`



- `others_36`
  - Different pattern, mules at higher values.


`email_domain` :No pattern as such

`demog_40` , `demog_43`: Categorical, interesting columns. A high value implies more mules.
`others_42` : Mules at lower values.



- `Target`
Only 2% of our targets are 1; such a low share of mules makes sense for a bank.

- `Dates`
The dates cover the last 4 months of financial year 2022-23. There is a slight rise in mules in the later part of January.

- `Country Code`
Most of the people are from India(above 90%) and all mules are Indians

- `Demog_2`
There is a significant rise in Mule values around 2 and 3 

- `Income`
There is a trend of more mules with lower income values.

- `City-Tier`
Higher number of mules from Rural area, and significantly less from Tier-1

- `Occupation`
Salaried and Student have a very low percentage of mules; Self-Employed have a significantly high number of mules.

- `Demog_4`
Most fraudsters have the value N, but about 90% of all values are N anyway.

- `Demog_9`
Higher percent of values from -0.5 to 0.5

- `Demog_13`
Higher percent of values from -0.5 to 0.5

- `Demog_14`
Higher percent of values from -0.5 to 0.5

- `Demog_20`
Significant drop from 0.5 to 1.5

- `Demog_21`
Significant drop from 0.5 to 1.5

- `Demog_22`
All mules in Y, but very low number of N in total

- `os`
Significant number of mules on Android; only 0.8% of iOS users are mules.

- `tx_1-15`
Most values are 0 (the distributions are also very similar, except for very few outliers).

- `tx_53`
Still mules around 0 and 1 values but different from general pattern, mules are more on 0 than 1.

- `tx_54`
High number of values at 2 but very low mules.

- `demog_23` , `demog_32`
Lower values have higher mules.

