SUPERVISED REGRESSION


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge, Lasso
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import zipfile
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

In [90]:
# Load data
with zipfile.ZipFile('production-quality.zip', 'r') as zip_ref:
    zip_ref.extractall('production-quality')

In [91]:
# Read data
X_init = pd.read_csv("production-quality/data_X.csv")
submission = pd.read_csv("production-quality/sample_submission.csv")
Y_init = pd.read_csv("production-quality/data_Y.csv")
submission.tail(1)
Y_init.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29184 entries, 0 to 29183
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   date_time  29184 non-null  object
 1   quality    29184 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 456.1+ KB


In [92]:
# Convert to date_time format
X_init['date_time']=pd.to_datetime(X_init['date_time'])
Y_init['date_time']=pd.to_datetime(Y_init['date_time'])
submission['date_time']=pd.to_datetime(submission['date_time'])

In [93]:
# extracting only the date and hour
X_init['date_hour']=X_init['date_time'].dt.strftime('%Y-%m-%d %H')
X_init=X_init.drop(columns=['date_time'])

         T_data_1_1  T_data_1_2  T_data_1_3  T_data_2_1  T_data_2_2  \
2103831         271         253         265         353         359   
2103832         271         254         265         353         359   
2103833         271         256         265         353         359   
2103834         271         258         265         353         359   
2103835         271         259         265         353         359   
2103836         271         261         265         353         359   
2103837         271         261         265         353         359   
2103838         271         261         265         353         359   
2103839         271         261         265         353         359   
2103840         271         261         265         353         359   

         T_data_2_3  T_data_3_1  T_data_3_2  T_data_3_3  T_data_4_1  \
2103831         353         481         449         491         325   
2103832         353         481         449         491         325   
21038

In [94]:
# Grouping the data by date_hour and AH_data and aggregating it into lists
X_aggregated = X_init.groupby(['date_hour', 'AH_data']).agg(lambda x: list(x)).reset_index()
print(X_aggregated.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35065 entries, 0 to 35064
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   date_hour   35065 non-null  object 
 1   AH_data     35065 non-null  float64
 2   T_data_1_1  35065 non-null  object 
 3   T_data_1_2  35065 non-null  object 
 4   T_data_1_3  35065 non-null  object 
 5   T_data_2_1  35065 non-null  object 
 6   T_data_2_2  35065 non-null  object 
 7   T_data_2_3  35065 non-null  object 
 8   T_data_3_1  35065 non-null  object 
 9   T_data_3_2  35065 non-null  object 
 10  T_data_3_3  35065 non-null  object 
 11  T_data_4_1  35065 non-null  object 
 12  T_data_4_2  35065 non-null  object 
 13  T_data_4_3  35065 non-null  object 
 14  T_data_5_1  35065 non-null  object 
 15  T_data_5_2  35065 non-null  object 
 16  T_data_5_3  35065 non-null  object 
 17  H_data      35065 non-null  object 
dtypes: float64(1), object(17)
memory usage: 4.8+ MB
None


In [95]:
# Expands the columns that has lists into seperate columns
for col in X_aggregated.columns:
    if X_aggregated[col].apply(lambda x: isinstance(x, list)).any():
        # Expanding only the rows with lists into separate columns
        expanded_cols = X_aggregated[col].apply(lambda x: x if isinstance(x, list) else [x]).apply(pd.Series)
        # Each new column is named with a suffix indicating position in the list
        expanded_cols.columns = [f"{col}_{i+1}" for i in range(expanded_cols.shape[1])]
        # Concatenate expanded columns with the original DataFrame
        X_aggregated = pd.concat([X_aggregated.drop(columns=[col]), expanded_cols], axis=1)

In [96]:
# Subtract 1 hour ( Y has produced the quality result of the work done in previous hour, so we shift Y to match the values of that hour)
Y_init['date_time'] = Y_init['date_time'] - pd.Timedelta(hours=1)
# Converting into date hour format
Y_init['date_hour'] = Y_init['date_time'].dt.strftime('%Y-%m-%d %H')
Y_init = Y_init.drop(columns=['date_time'])
submission['date_hour'] = submission['date_time'].dt.strftime('%Y-%m-%d %H')
submission_file = submission.drop(columns=['date_time'])
# Step 4: Check the resulting DataFrame
Y_init.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29184 entries, 0 to 29183
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   quality    29184 non-null  int64 
 1   date_hour  29184 non-null  object
dtypes: int64(1), object(1)
memory usage: 456.1+ KB


In [97]:
# Merge X and Y - Inner 
data = pd.merge(X_aggregated, Y_init, on='date_hour', how='inner')
data['date_hour'] = pd.to_datetime(data['date_hour'], format='%Y-%m-%d %H')
data['year'] = data['date_hour'].dt.year
data['month'] = data['date_hour'].dt.month
data['day'] = data['date_hour'].dt.day
data['hour'] = data['date_hour'].dt.hour
data = data.drop(columns=['date_hour'])

# Merge X and submission file
validation = pd.merge(X_aggregated, submission[["date_hour","quality"]], left_on="date_hour", right_on="date_hour", how='inner')
validation['date_hour'] = pd.to_datetime(validation['date_hour'], format='%Y-%m-%d %H')
validation['year'] = validation['date_hour'].dt.year
validation['month'] = validation['date_hour'].dt.month
validation['day'] = validation['date_hour'].dt.day
validation['hour'] = validation['date_hour'].dt.hour
validation = validation.drop(columns=['date_hour'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29184 entries, 0 to 29183
Columns: 966 entries, AH_data to hour
dtypes: float64(961), int32(4), int64(1)
memory usage: 214.6 MB
AH_data         0
T_data_1_1_1    0
T_data_1_1_2    0
T_data_1_1_3    0
T_data_1_1_4    0
               ..
quality         0
year            0
month           0
day             0
hour            0
Length: 966, dtype: int64


PCA - To reduce the high dimensional dataset

In [129]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

X=data.drop(columns=['quality'])
y=data['quality']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)

MODEL TRAINING

In [130]:
X=data.drop(columns=['quality'])
y=data['quality']
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=0)
models = {
    #'linear': LinearRegression(),
    #'Ridge': Ridge(),
    #'Lasso': Lasso(),
    'RandomForest': RandomForestRegressor(n_estimators=200, random_state=42),
    #'GradientBoosting': GradientBoostingRegressor(n_estimators=200, random_state=42),
    #'Support Vector Machine (SVM)': SVR(),
    #'Decision Tree': DecisionTreeRegressor(random_state=42),
    'XGBoost': XGBRegressor(n_estimators=200, random_state=42)
}

In [131]:
model_scores={}
print("Model Performance Comparison (Mean Absolute Error):")
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    score = mean_absolute_error(y_test, y_pred)
    model_scores[model_name] = score
    print(f"{model_name} MSE : {score}")
    
best_model_name = min(model_scores, key=model_scores.get)
best_model = models[best_model_name]
print(f"\nBest Model: {best_model_name}\n ")

Model Performance Comparison (Mean Absolute Error):
RandomForest MSE : 7.6849777282850775
XGBoost MSE : 7.83484349161749

Best Model: RandomForest
 


In [132]:
val = validation.drop(columns=['quality'])
val_scaled = scaler.transform(val)
val_pca = pca.transform(val_scaled)
quality=best_model.predict(val_scaled)
submission['quality']=quality

In [133]:
submission = submission[["date_time","quality"]]
submission

Unnamed: 0,date_time,quality
0,2018-05-04 00:05:00,440.875
1,2018-05-04 01:05:00,435.670
2,2018-05-04 02:05:00,407.890
3,2018-05-04 03:05:00,408.670
4,2018-05-04 04:05:00,413.140
...,...,...
5803,2018-12-31 19:05:00,472.105
5804,2018-12-31 20:05:00,453.770
5805,2018-12-31 21:05:00,443.870
5806,2018-12-31 22:05:00,439.100
