In [2]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense,Dropout,TimeDistributed
import glob
import os
import pyarrow as pa
import pyarrow.parquet as pq


In [9]:
def preprocess_data(file_path):
    """Count the columns of a Parquet file that contain at least one null.

    Args:
        file_path: Path passed straight to ``pd.read_parquet``.

    Returns:
        The number of columns for which ``isnull()`` is true in any row.
    """
    frame = pd.read_parquet(file_path)
    column_has_null = frame.isnull().any()
    return column_has_null.sum()

In [12]:
def usefulFeatures():
    """Record non-null counts and null percentages for the tracked columns.

    The column list is taken from the header of ``data/useful_columns.csv``.
    Every ``<entry>/part-0.parquet`` under ``data/train.parquet`` is scanned and
    null / non-null cells are tallied per column. Two rows labelled
    "Not Null Count" and "Null Percentage" are then added to the CSV frame and
    the file is rewritten in place.

    The percentage is computed per column: nulls divided by the number of cells
    seen for that same column. A column that was never seen gets 0.0.

    Note: the CSV is saved with ``index=False``, so the row labels are not
    persisted. When the file is read back the rows are addressed by position,
    and calling this function again adds two more rows instead of replacing
    the existing ones.
    """
    train_dir = "data/train.parquet"
    df_useful = pd.read_csv("data/useful_columns.csv")

    null_counts = {col: 0 for col in df_useful.columns}
    non_null_counts = {col: 0 for col in df_useful.columns}

    for file in os.listdir(train_dir):
        df_file = os.path.join(train_dir, file, "part-0.parquet")
        if not os.path.exists(df_file):
            # Entries without a part file are skipped.
            continue

        current_df = pd.read_parquet(df_file)
        for useful_column in df_useful.columns:
            if useful_column in current_df.columns:
                null_mask = current_df[useful_column].isnull()
                null_value_count = null_mask.sum()
                null_counts[useful_column] += null_value_count
                non_null_counts[useful_column] += len(null_mask) - null_value_count

    # Percentage of nulls relative to the cells seen for that column only.
    # Dividing by a grand total over all columns would shrink every percentage
    # by roughly the number of tracked columns.
    null_percentages = {}
    for col in df_useful.columns:
        cells_seen = null_counts[col] + non_null_counts[col]
        if cells_seen:
            null_percentages[col] = (null_counts[col] / cells_seen) * 100
        else:
            null_percentages[col] = 0.0

    # Add the new rows to the DataFrame
    df_useful.loc["Not Null Count"] = non_null_counts
    df_useful.loc["Null Percentage"] = null_percentages

    # Save the updated DataFrame back to CSV
    df_useful.to_csv("data/useful_columns.csv", index=False)
    print("Null values, non-null counts, and null percentages added.")

In [14]:
def handle_empty_values():
    """Fill nulls in the tracked columns with each column's global mean.

    The column list comes from the header of ``data/useful_columns.csv``.
    Two passes are made over every ``<entry>/part-0.parquet`` under
    ``data/train.parquet``:

    1. accumulate, per column, the sum and the number of non-null values;
    2. replace nulls with ``sum / non_null_count`` and write the frame back to
       the same part file (the training data is modified in place).

    The mean divides by the non-null count because ``Series.sum`` skips nulls;
    dividing by the total row count would pull the imputed value toward 0.
    A column with no non-null value at all gets a mean of 0.0.

    Finally a row labelled "Mean" is added to the CSV frame, the CSV is
    rewritten with ``index=False`` and the means are printed.

    Assumes the tracked columns are numeric -- TODO confirm.
    """
    train_dir = "data/train.parquet"
    df_useful = pd.read_csv("data/useful_columns.csv")

    column_sums = {col: 0 for col in df_useful.columns}
    column_counts = {col: 0 for col in df_useful.columns}

    # Resolve the part files once so both passes see the same set; entries
    # without a part file are skipped.
    part_files = []
    for file in os.listdir(train_dir):
        df_file = os.path.join(train_dir, file, "part-0.parquet")
        if os.path.exists(df_file):
            part_files.append(df_file)

    # Pass 1: global sum and non-null count per column.
    for df_file in part_files:
        current_df = pd.read_parquet(df_file)
        for useful_column in df_useful.columns:
            if useful_column in current_df.columns:
                column_sums[useful_column] += current_df[useful_column].sum()
                column_counts[useful_column] += current_df[useful_column].count()

    mean = {}
    for useful_column in df_useful.columns:
        if column_counts[useful_column]:
            mean[useful_column] = column_sums[useful_column] / column_counts[useful_column]
        else:
            mean[useful_column] = 0.0

    # Pass 2: impute and overwrite each part file.
    for df_file in part_files:
        current_df = pd.read_parquet(df_file)
        for useful_column in df_useful.columns:
            if useful_column in current_df.columns:
                current_df[useful_column] = current_df[useful_column].fillna(mean[useful_column])
        current_df.to_parquet(df_file)

    df_useful.loc["Mean"] = mean
    df_useful.to_csv("data/useful_columns.csv", index=False)

    print(mean)

In [23]:
def min_max_scaling():
    """Write min-max-scaled copies of every training partition.

    The column list comes from the header of ``data/useful_columns.csv``.
    A first pass over every ``<entry>/part-0.parquet`` under
    ``data/train.parquet`` finds the global minimum and maximum of each tracked
    column. A second pass rescales those columns with
    ``(x - min) / (max - min)`` and saves the result to
    ``data/scaled/<entry>/part-0.parquet``; the source files are not modified.

    A column whose global max is not greater than its min is set to 0.
    Output directories are created as needed, including ``data/scaled`` itself.
    The per-entry name and the final min/max dictionaries are printed.
    """
    train_dir = "data/train.parquet"
    df_useful = pd.read_csv("data/useful_columns.csv")

    # Initialize min and max values with extreme numbers
    min_values = {col: float('inf') for col in df_useful.columns}
    max_values = {col: float('-inf') for col in df_useful.columns}

    # Step 1: Find global min and max for each column
    for file in os.listdir(train_dir):
        df_file = os.path.join(train_dir, file, "part-0.parquet")
        if not os.path.exists(df_file):
            # Entries without a part file are skipped.
            continue
        current_df = pd.read_parquet(df_file)
        for useful_column in df_useful.columns:
            if useful_column in current_df.columns:
                current_min = current_df[useful_column].min()
                current_max = current_df[useful_column].max()
                min_values[useful_column] = min(min_values[useful_column], current_min)
                max_values[useful_column] = max(max_values[useful_column], current_max)

    # Step 2: Scale each column in the dataset using Min-Max Scaling
    for file in os.listdir(train_dir):
        df_file = os.path.join(train_dir, file, "part-0.parquet")
        if not os.path.exists(df_file):
            continue

        new_df_file = os.path.join("data/scaled", file, "part-0.parquet")
        # makedirs also creates the missing "data/scaled" parent and does not
        # fail when the directory is already there from a previous run.
        os.makedirs(os.path.dirname(new_df_file), exist_ok=True)
        print(file)

        current_df = pd.read_parquet(df_file)
        for useful_column in df_useful.columns:
            if useful_column in current_df.columns:
                # Min-Max Scaling formula: (x - min) / (max - min)
                min_val = min_values[useful_column]
                max_val = max_values[useful_column]
                if max_val > min_val:  # Avoid division by zero
                    current_df[useful_column] = (
                        current_df[useful_column] - min_val
                    ) / (max_val - min_val)
                else:
                    # Handle the case where all values are the same
                    current_df[useful_column] = 0

        # Save the scaled DataFrame to the parallel location under data/scaled
        current_df.to_parquet(new_df_file)

    print("Min-Max Scaling applied successfully!")
    print("Min values:", min_values)
    print("Max values:", max_values)

In [12]:
def calculate_pearson_correlation(mean_row=3, target_column="responder_6"):
    """Compute the Pearson correlation of every tracked column with the target.

    The column list and the centering values come from
    ``data/useful_columns.csv``. The sums are accumulated over every
    ``<entry>/part-0.parquet`` under ``data/train.parquet`` so the full data
    set never has to be held in memory:

        r = sum((x - mx) * (y - my)) / (sqrt(sum((x - mx)^2)) * sqrt(sum((y - my)^2)))

    Args:
        mean_row: Row label of the CSV (as read back by ``pd.read_csv``) whose
            values are subtracted from each column. It is presumably the row
            holding the column means; the default 3 is the position this
            function has always used -- verify it against the row order
            actually present in the CSV.
        target_column: Column correlated against every tracked column.

    Returns:
        Dict mapping column name to its correlation, or ``np.nan`` when the
        denominator is 0. The same dict is also printed.

    Raises:
        KeyError: if ``mean_row`` is not in the CSV, or a scanned file that
            contains tracked columns lacks ``target_column``.
    """
    train_dir = "data/train.parquet"
    df_useful = pd.read_csv("data/useful_columns.csv")
    columns = list(df_useful.columns)

    cross_sums = {col: 0 for col in columns}
    feature_sq_sums = {col: 0 for col in columns}
    target_sq_sums = {col: 0 for col in columns}

    for file in os.listdir(train_dir):
        df_file = os.path.join(train_dir, file, "part-0.parquet")
        if not os.path.exists(df_file):
            # Entries without a part file are skipped.
            continue

        current_df = pd.read_parquet(df_file)
        present = [col for col in columns if col in current_df.columns]
        if not present:
            continue

        # The target terms are identical for every column of this file, so
        # they are computed once instead of once per column.
        target_dev = current_df[target_column].to_numpy() - df_useful.loc[mean_row, target_column]
        target_sq = np.sum(target_dev ** 2)

        for col in present:
            feature_dev = current_df[col].to_numpy() - df_useful.loc[mean_row, col]
            cross_sums[col] += np.sum(feature_dev * target_dev)
            feature_sq_sums[col] += np.sum(feature_dev ** 2)
            # Accumulated per column so it only covers files that contain it.
            target_sq_sums[col] += target_sq

    pearson_correlations = {}
    for col in columns:
        denominator = (feature_sq_sums[col] ** 0.5) * (target_sq_sums[col] ** 0.5)
        if denominator != 0:
            pearson_correlations[col] = cross_sums[col] / denominator
        else:
            pearson_correlations[col] = np.nan
    print(pearson_correlations)
    return pearson_correlations

In [None]:
# Add up, over every partition, the number of columns that contain a null.
train_dir = "data/train.parquet"
# `df` and `dataframes` are not used in this cell; they are kept in case other
# cells still expect them to exist.
df = pd.DataFrame()
dataframes = []
# Do not call this accumulator `sum`: at cell level that rebinds the builtin
# for every cell executed afterwards, and any later `sum(...)` call then fails
# with "'int' object is not callable".
null_column_total = 0
for file in os.listdir(train_dir):
    df_file = os.path.join(train_dir, file, "part-0.parquet")
    null_column_total += preprocess_data(df_file)
print(null_column_total)

In [None]:
# Scan the training partitions and rewrite data/useful_columns.csv with the
# per-column count and percentage rows.
usefulFeatures()

In [None]:
# Reload the stats CSV and show the value stored at row 3 for the target column.
# calculate_pearson_correlation subtracts this same cell from the target values.
df_useful = pd.read_csv("data/useful_columns.csv")
print(df_useful.loc[3,"responder_6"])

In [None]:
# Write min-max-scaled copies of every partition under data/scaled/.
min_max_scaling()

In [7]:

# Bootstrap the stats file: write a header-only CSV whose columns are those of
# partition 0, overwriting any data/useful_columns.csv that already exists.
# NOTE(review): usefulFeatures, handle_empty_values, min_max_scaling and
# calculate_pearson_correlation all read this CSV, so this cell has to run
# before any of them even though it sits below their call cells.
df_train=pd.read_parquet("data/train.parquet/partition_id=0/part-0.parquet")
# `df` is rebound here to an empty frame that carries only the column names.
df=pd.DataFrame(columns=df_train.columns)
df.to_csv("data/useful_columns.csv",index=False)

In [15]:
def data_generator(target_column, batch_size=64, timesteps=60):
    """Yield sliding-window batches from the training partitions forever.

    Every entry of ``data/train.parquet`` is read with ``pd.read_parquet``.
    For each row offset ``i`` the sample is the ``timesteps`` feature rows
    ``[i, i + timesteps)`` and the label is the target value at row
    ``i + timesteps``. Batches are built per entry; an incomplete batch left
    over at the end of an entry is discarded. After the last entry the scan
    restarts from the first one.

    Args:
        target_column: Column used as the label; all other columns are features.
        batch_size: Number of windows per yielded batch.
        timesteps: Number of consecutive rows per window.

    Yields:
        ``(X, y)`` where ``X`` has shape ``(batch_size, timesteps, n_features)``
        and ``y`` has shape ``(batch_size,)``.

    Raises:
        ValueError: if a complete pass over the directory produces no batch
            (empty directory, or no entry has at least
            ``timesteps + batch_size`` rows). Without this check the generator
            would loop forever without yielding.
    """
    train_dir = "data/train.parquet"

    while True:  # Infinite loop to make the generator reusable
        produced_batch = False
        for file in os.listdir(train_dir):
            # Construct full file path
            file_path = os.path.join(train_dir, file)

            # Read the Parquet file
            df = pd.read_parquet(file_path)

            # Select features and target
            features = df.drop(columns=[target_column]).to_numpy()
            target = df[target_column].to_numpy()

            X_batch, y_batch = [], []

            # Prepare data in sequences for LSTM
            for i in range(len(features) - timesteps):
                X_batch.append(features[i:i + timesteps])
                y_batch.append(target[i + timesteps])

                if len(X_batch) == batch_size:
                    produced_batch = True
                    yield np.array(X_batch), np.array(y_batch)  # Return a batch
                    X_batch, y_batch = [], []

        if not produced_batch:
            raise ValueError(
                f"No batch of {batch_size} windows with {timesteps} timesteps "
                f"could be built from {train_dir!r}"
            )

In [26]:
def create_lstm_model(input_shape):
    """Build and compile a two-layer LSTM regressor.

    Args:
        input_shape: ``(timesteps, n_features)`` of one input window.

    Returns:
        A compiled ``Sequential`` model: LSTM(50) returning the full sequence,
        LSTM(50) returning only its last step, then ``Dense(1)``. It is
        compiled with the Adam optimizer, MSE loss and MAE as the only metric.
    """
    model = Sequential([
        LSTM(50, activation="relu", return_sequences=True, input_shape=input_shape),
        # The last recurrent layer must not return the whole sequence:
        # data_generator yields one target value per window, so the network
        # has to emit one value per window rather than one per timestep.
        LSTM(50, activation="relu"),
        Dense(1)  # Output layer for regression
    ])
    # 'accuracy' is a classification metric and carries no meaning for a
    # continuous target trained with MSE, so only MAE is tracked.
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model


In [None]:
def data_generator(target_column, batch_size=64, timesteps=60):
    """Yield sliding-window batches from the training partitions forever.

    Every ``<entry>/part-0.parquet`` under ``data/train.parquet`` is read in
    turn. For each row offset ``i`` the sample is the ``timesteps`` feature
    rows ``[i, i + timesteps)`` and the label is the target value at row
    ``i + timesteps``. Batches are built per file; an incomplete batch left
    over at the end of a file is discarded. After the last file the scan
    restarts from the first one.

    Args:
        target_column: Column used as the label; all other columns are features.
        batch_size: Number of windows per yielded batch. The default matches
            the earlier definition of this function in the notebook.
        timesteps: Number of consecutive rows per window. The default matches
            the earlier definition of this function in the notebook.

    Yields:
        ``(X, y)`` where ``X`` has shape ``(batch_size, timesteps, n_features)``
        and ``y`` has shape ``(batch_size,)``.

    Raises:
        ValueError: if a complete pass over the directory produces no batch
            (empty directory, or no file has at least
            ``timesteps + batch_size`` rows). Without this check the generator
            would loop forever without yielding.
    """
    train_dir = "data/train.parquet"

    while True:  # Infinite loop to make the generator reusable
        produced_batch = False
        for file in os.listdir(train_dir):
            # Construct full file path
            file_path = os.path.join(train_dir, file, "part-0.parquet")

            # Read the Parquet file
            df = pd.read_parquet(file_path)

            # Select features and target
            features = df.drop(columns=[target_column]).to_numpy()
            target = df[target_column].to_numpy()

            X_batch, y_batch = [], []

            # Prepare data in sequences for LSTM
            for i in range(len(features) - timesteps):
                X_batch.append(features[i:i + timesteps])
                y_batch.append(target[i + timesteps])

                if len(X_batch) == batch_size:
                    produced_batch = True
                    yield np.array(X_batch), np.array(y_batch)  # Return a batch
                    X_batch, y_batch = [], []

        if not produced_batch:
            raise ValueError(
                f"No batch of {batch_size} windows with {timesteps} timesteps "
                f"could be built from {train_dir!r}"
            )

# Function to Calculate Steps Per Epoch
def calculate_steps_per_epoch(parquet_dir, timesteps, batch_size):
    """Count the full batches one pass over ``parquet_dir`` can produce.

    Only the Parquet metadata of each ``<entry>/part-0.parquet`` is read. The
    count is taken per file because the notebook's ``data_generator`` starts a
    fresh batch for every file and drops the incomplete batch at its end:

        batches(file) = max(0, num_rows - timesteps) // batch_size

    A file with fewer than ``timesteps`` rows contributes 0 rather than a
    negative number.

    Args:
        parquet_dir: Directory whose entries each contain ``part-0.parquet``.
        timesteps: Window length used by the generator.
        batch_size: Number of windows per batch.

    Returns:
        Total number of full batches over all files.
    """
    total_batches = 0
    for file in os.listdir(parquet_dir):
        file_path = os.path.join(parquet_dir, file, "part-0.parquet")
        num_rows = pq.ParquetFile(file_path).metadata.num_rows
        windows = max(0, num_rows - timesteps)
        total_batches += windows // batch_size
    return total_batches

# LSTM Model
def create_lstm_model(input_shape):
    """Assemble and compile the stacked-LSTM regression network.

    Args:
        input_shape: Shape of a single input window, passed to the first LSTM
            layer as ``input_shape``.

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, MSE loss, MAE metric).
    """
    # First recurrent layer keeps the whole sequence so the second one has
    # a sequence to consume.
    sequence_encoder = LSTM(50, activation="relu", return_sequences=True, input_shape=input_shape)
    # Second recurrent layer uses the default return_sequences setting.
    summary_encoder = LSTM(50, activation="relu")
    # Single-unit output layer for regression.
    regression_head = Dense(1)

    network = Sequential([sequence_encoder, summary_encoder, regression_head])
    network.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return network

# Main Script
# Training entry point: derive the input shape from the first directory entry,
# build the model, and fit it on the endless batch generator for 10 epochs.
if __name__ == "__main__":
    # Run configuration. These names are bound at notebook/module level.
    target_column = "responder_6"
    batch_size = 4096
    timesteps = 60
    parquet_dir = "data/train.parquet"
    
    # Determine Input Shape (using the first file)
    # NOTE(review): sample_file is an entry of parquet_dir (presumably a
    # partition directory), so read_parquet is pointed at that entry rather
    # than at its part-0.parquet, and the whole frame is loaded only to count
    # its columns.
    sample_file = os.listdir(parquet_dir)[0]
    sample_df = pd.read_parquet(os.path.join(parquet_dir, sample_file))
    input_shape = (timesteps, sample_df.shape[1] - 1)  # Exclude the target column

    # Create the LSTM model
    model = create_lstm_model(input_shape)

    # Calculate steps per epoch
    steps_per_epoch = calculate_steps_per_epoch(parquet_dir, timesteps, batch_size)

    # Initialize data generator
    generator = data_generator(target_column=target_column, batch_size=batch_size, timesteps=timesteps)

    # Train the model
    # The generator never terminates, so steps_per_epoch is what delimits an epoch.
    model.fit(generator, steps_per_epoch=steps_per_epoch, epochs=10)

    # Save the model
    # Written to the current working directory.
    model.save("lstm_model.h5")
    print("Model training complete. Saved as 'lstm_model.h5'.")
