##### Imports

In [1]:
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
from sdv.single_table import CTGANSynthesizer
from sdv.evaluation.single_table import run_diagnostic, evaluate_quality, get_column_plot, get_column_pair_plot
from sdv.metadata import SingleTableMetadata

##### Load dataset

In [2]:
data = pd.read_csv("data/merged_df_file_rutgers.csv")

In [3]:
data.head()

Unnamed: 0,serverTimestamp,day_part_x,user_id,numberRating,highestRating,lowestRating,medianRating,sdRating,numberLowRating,numberMediumRating,numberHighRating,numberMessageReceived,numberMessageRead,readAllMessage,reward,timestamp,day_part_y,action,message,day_part_numeric
0,10/6/2020,0,1BIGBILHOT,0,0,0,0.0,0.0,0,0,0,0,0,0,0.0,01:03.0,morning,0.0,No message was sent!,0.0
1,10/6/2020,1,1BIGBILHOT,0,0,0,0.0,0.0,0,0,0,1,1,1,0.5,01:04.3,afternoon,2.0,Did you forget what pleasant activities to do?...,1.0
2,10/6/2020,2,1BIGBILHOT,0,0,0,0.0,0.0,0,0,0,2,2,1,0.5,01:03.7,evening,3.0,"Many people sometimes feel sad, this is nothin...",2.0
3,10/7/2020,0,1BIGBILHOT,0,0,0,0.0,0.0,0,0,0,1,1,1,0.5,01:03.6,morning,3.0,Even if you don’t rate your mood at some point...,0.0
4,10/7/2020,1,1BIGBILHOT,0,0,0,0.0,0.0,0,0,0,2,2,1,0.5,01:03.7,afternoon,3.0,"Many people sometimes feel sad, this is nothin...",1.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9564 entries, 0 to 9563
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   serverTimestamp        9564 non-null   object 
 1   day_part_x             9564 non-null   int64  
 2   user_id                9564 non-null   object 
 3   numberRating           9564 non-null   int64  
 4   highestRating          9564 non-null   int64  
 5   lowestRating           9564 non-null   int64  
 6   medianRating           9564 non-null   float64
 7   sdRating               9564 non-null   float64
 8   numberLowRating        9564 non-null   int64  
 9   numberMediumRating     9564 non-null   int64  
 10  numberHighRating       9564 non-null   int64  
 11  numberMessageReceived  9564 non-null   int64  
 12  numberMessageRead      9564 non-null   int64  
 13  readAllMessage         9564 non-null   int64  
 14  reward                 9564 non-null   float64
 15  time

##### Data Cleaning (need to double check)

In [5]:
def process_csv(df):
    for col in df.columns:
        if pd.api.types.is_numeric_dtype(df[col]): # Handle numerical missing values (mean imputation)
            if df[col].isnull().any():
                df[col].fillna(df[col].mean(), inplace=True)
        else: # Handle missing values in categorical or other object columns
            if pd.api.types.is_object_dtype(df[col]):
                if df[col].isnull().any():
                    df[col].fillna(df[col].mode()[0], inplace=True)  # Impute with mode for categorical data
    return df

clean_data = process_csv(data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)  # Impute with mode for categorical data
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


In [6]:
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9564 entries, 0 to 9563
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   serverTimestamp        9564 non-null   object 
 1   day_part_x             9564 non-null   int64  
 2   user_id                9564 non-null   object 
 3   numberRating           9564 non-null   int64  
 4   highestRating          9564 non-null   int64  
 5   lowestRating           9564 non-null   int64  
 6   medianRating           9564 non-null   float64
 7   sdRating               9564 non-null   float64
 8   numberLowRating        9564 non-null   int64  
 9   numberMediumRating     9564 non-null   int64  
 10  numberHighRating       9564 non-null   int64  
 11  numberMessageReceived  9564 non-null   int64  
 12  numberMessageRead      9564 non-null   int64  
 13  readAllMessage         9564 non-null   int64  
 14  reward                 9564 non-null   float64
 15  time

##### Metadata Extraction

In [7]:
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(clean_data)

##### Model Initialization

In [8]:
# Initialize the model
synthesizer = CTGANSynthesizer(
    metadata=metadata,
    enforce_rounding=False,
    epochs=100,
    verbose=True)
# Fit the synthesizer to real data
synthesizer.fit(clean_data)

  from .autonotebook import tqdm as notebook_tqdm
Gen. (-0.99) | Discrim. (-0.41): 100%|██████████| 100/100 [04:26<00:00,  2.67s/it]


In [9]:
# Generate synthetic data
synthetic_data = synthesizer.sample(len(clean_data))

In [10]:
# Save synthetic data to CSV file
os.makedirs(os.path.dirname("data/gen_data.csv"), exist_ok=True)
synthetic_data.to_csv("data/gen_data.csv", index=False)

In [11]:
# Run diagnostic and evaluate quality
diagnostic = run_diagnostic(real_data=clean_data, synthetic_data=synthetic_data, metadata=metadata)

Generating report ...

(1/2) Evaluating Data Validity: |██████████| 20/20 [00:00<00:00, 1999.19it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 1001.03it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%



In [12]:
quality_report = evaluate_quality(real_data=clean_data, synthetic_data=synthetic_data, metadata=metadata)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 20/20 [00:00<00:00, 476.15it/s]|
Column Shapes Score: 70.39%

(2/2) Evaluating Column Pair Trends: |██████████| 190/190 [00:02<00:00, 89.51it/s]| 
Column Pair Trends Score: 73.32%

Overall Score (Average): 71.85%



In [13]:
'''column_plot = get_column_plot(real_data=clean_data, synthetic_data=synthetic_data, column_name="user_id", metadata=metadata)
column_pair_plot = get_column_pair_plot(real_data=clean_data, synthetic_data=synthetic_data, column_names=["numberMessageReceived", "numberMessageRead"], metadata=metadata)

column_plot.show()
column_pair_plot.show()'''