### Import the packages
Note: main documentation for SweetViz can be found on: https://pypi.org/project/sweetviz/

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sweetviz as sv
import os
import fastparquet
# import dask.dataframe as dd
# import feather

### Load the data

In [2]:
train = pd.read_parquet('data/train.parquet')

### Change column headers

In [3]:
# column headers
train.columns = train.columns.str.lower()
train.columns = train.columns.str.replace(' ', '_')

### Map labels to training data and add to train

In [4]:
train_label = pd.read_csv('data/train_labels.csv')
my_dict = train_label.set_index('customer_ID').to_dict()['target']
train['target'] = train['customer_id'].map(my_dict)

### Create samples of train data

In [5]:
# frac =.01 = 1/100 deel
df_train_sample_1 = train.sample(frac =.01, random_state=1)

### Check some basic elements of the train and test data

In [17]:
df_train_sample_1.shape

(55315, 191)

### Save the sample from train (55315 records)

In [21]:
df_train_sample_1.to_csv('train_sample_large.csv', index=False)

### Remove duplicates in the customer Id column (optional)

In [7]:
num_duplicates = df_train_sample_1.duplicated(subset=['customer_id']).sum()
print("Number of duplicate values in 'customer_id' column:", num_duplicates)

Number of duplicate values in 'customer_id' column: 3118


In [8]:
df_train_sample_1 = df_train_sample_1.drop_duplicates(subset=['customer_id'])

### Save df_train_sample_1

In [13]:
df_train_sample_1.to_csv('df_train_sample.csv', index=False)

### Load a smaller dataset (5000 rows)

In [7]:
train_sample_small = pd.read_csv('data\df_sample_500.csv')

### Create a SweetViz Report of the Sample Training Set

In [None]:
# Analyze the DataFrame with Sweetviz
report = sv.analyze(df_train_sample_1, target_feat='target')

In [19]:
report.show_html("Report_large.html")

Report Report_large.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
---
(likely due to only having a single row, containing non-NaN values for both correlated features)
Affected correlations:['d_73/d_134', 'd_88/d_134', 'd_134/d_73', 'd_134/d_88']


### Other use cases of SweetViz (not used)
https://pypi.org/project/sweetviz/

In [None]:
my_report = sv.analyze(my_dataframe)
my_report.show_html() # Default arguments will generate to "SWEETVIZ_REPORT.html"

# analyze arguments:
# analyze(source: Union[pd.DataFrame, Tuple[pd.DataFrame, str]],
#             target_feat: str = None,
#             feat_cfg: FeatureConfig = None,
#             pairwise_analysis: str = 'auto',
#             verbosity: str = 'default'):
feature_config = sv.FeatureConfig(skip="PassengerId", force_text=["Age"])
# 

# Comparing two dataframes (e.g. Test vs Training sets)
my_report = sv.compare([my_dataframe, "Training Data"], [test_df, "Test Data"], "Survived", feature_config)

# Comparing two features
my_report = sv.compare_intra(my_dataframe, my_dataframe["Sex"] == "male", ["Male", "Female"], "Survived", feature_config)

# Show:
show_html(  filepath='SWEETVIZ_REPORT.html', 
            open_browser=True, 
            layout='widescreen', 
            scale=None)

show_notebook(  w=None, 
                h=None, 
                scale=None,
                layout='widescreen',
                filepath=None,
                file_layout=None,
                file_scale=None)

### Reduce dimensionality (leave only 38 columns)

In [35]:
keep = ['customer_id','r_4','r_2','b_33','r_13','b_30','b_22','r_10','r_24','r_21','d_87','s_20','b_38','b_33','b_22','b_30','d_51','r_2','r_10','p_2','d_48','d_61','b_18','b_9','b_2','d_75','d_55','d_58','d_44','b_3','b_7','b_23','d_74','b_8','r_3','p_4','d_112','r_16', 'target']

# Reduce DataFrame to include only the columns in the keep list
df_train_sample_1_reduced = df_train_sample_1.loc[:, keep]

In [34]:
df_train_sample_1_reduced.shape

(55315, 38)

In [31]:
df_train_sample_1_reduced.isna().sum()

customer_id       0
r_4               0
r_2               0
b_33              0
r_13              0
b_30              0
b_22              0
r_10              0
r_24              0
r_21              0
d_87              0
s_20              0
b_38              0
b_33              0
b_22              0
b_30              0
d_51              0
r_2               0
r_10              0
p_2             443
d_48           7207
d_61           6052
b_18              0
b_9               0
b_2              21
d_75              0
d_55           1779
d_58              0
d_44              0
b_3              21
b_7               0
b_23              0
d_74              0
b_8             232
r_3               0
p_4               0
d_112            25
r_16              0
target            0
dtype: int64

In [32]:
df_train_sample_1_reduced.to_csv('train_reduced.csv', index=False)