In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import QuantileTransformer
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# Read the two competition splits from the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
#train_ex = pd.read_csv('training_extra.csv')

In [None]:
# Observation: prices larger than 150 are capped at 150.
# Observation: many columns contain NaN values.

In [None]:
def transform(df):
    """Return an encoded copy of ``df``; the input frame is not modified.

    Steps:
      * ``Size``: missing values become ``'Medium'``, then
        Small/Medium/Large are mapped to 0/1/2.
      * ``Weight Capacity (kg)``: missing values are replaced by the mean of
        that column in ``df`` itself.
      * The columns in ``categorical_cols`` are one-hot encoded, each with an
        extra ``*_nan`` indicator, and the raw columns are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Size``, ``Weight Capacity (kg)`` and every column in
        ``categorical_cols``.

    Returns
    -------
    pandas.DataFrame
        New frame; columns not mentioned above pass through unchanged,
        followed by the dummy columns.
    """
    categorical_cols = ['Compartments', 'Material', 'Brand', 'Style', 'Color',
                        'Laptop Compartment', 'Waterproof']
    size_mapping = {'Small': 0, 'Medium': 1, 'Large': 2}

    encoded = df.copy(deep=True)
    # Assume 'Medium' for missing sizes before mapping to ordinal codes.
    encoded['Size'] = encoded['Size'].fillna('Medium').map(size_mapping)

    weight = encoded['Weight Capacity (kg)']
    encoded['Weight Capacity (kg)'] = weight.fillna(weight.mean())

    # `columns=` forces one-hot encoding of every listed column. Without it,
    # get_dummies only converts object/category columns, so a numeric column
    # (e.g. a float `Compartments`) would be passed through, duplicated by the
    # concat, and then removed entirely by the drop -- silently losing it.
    return pd.get_dummies(encoded, columns=categorical_cols, dummy_na=True)


def transform2(df):
    """Encode ``df`` exactly like :func:`transform`, then remove ``id``.

    Delegates to ``transform`` instead of repeating its body so the two
    encodings cannot drift apart.

    Parameters
    ----------
    df : pandas.DataFrame
        Same requirements as ``transform``, plus an ``id`` column.

    Returns
    -------
    pandas.DataFrame
        The encoded copy without the ``id`` column.

    Raises
    ------
    KeyError
        If ``df`` has no ``id`` column.
    """
    return transform(df).drop(columns='id')

def quantile_transform(df, n_quantiles=11):
    """Map ``df['Price']`` onto a normal distribution via its quantiles.

    ``QuantileTransformer`` takes the *number* of quantile landmarks
    (``n_quantiles``), not a list of quantile levels; passing ``quantiles=``
    raises ``TypeError``. The default of 11 gives evenly spaced landmarks at
    0, 0.1, ..., 1.0, which is what the previous list expressed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Price`` column.
    n_quantiles : int, default 11
        Number of quantile landmarks used by the transformer.

    Returns
    -------
    tuple
        ``(transformed, transformer)`` where ``transformed`` is the output of
        ``fit_transform`` on the single ``Price`` column and ``transformer``
        is the fitted ``QuantileTransformer``.
    """
    quantile = QuantileTransformer(output_distribution='normal', n_quantiles=n_quantiles)
    return quantile.fit_transform(df[['Price']]), quantile
def reverse_quantile_transform(pred, quantile):
    """Map predictions back to the original scale with a fitted transformer.

    Parameters
    ----------
    pred : array-like
        One prediction per row. Anything ``np.asarray`` accepts works
        (ndarray, list, pandas Series); a bare ``.reshape`` call would fail
        on a Series or a list.
    quantile : object
        Fitted transformer exposing ``inverse_transform``.

    Returns
    -------
    Whatever ``quantile.inverse_transform`` returns for a single-column
    2-D input.
    """
    column = np.asarray(pred).reshape(-1, 1)
    return quantile.inverse_transform(column)

from sklearn.preprocessing import KBinsDiscretizer

def discretize_transform(df):
    """Bin ``df['Price']`` into 10 quantile-based ordinal buckets.

    Returns a ``(binned, discretizer)`` pair: the result of ``fit_transform``
    on the single ``Price`` column and the fitted ``KBinsDiscretizer``.
    """
    binner = KBinsDiscretizer(strategy='quantile', encode='ordinal', n_bins=10)
    price_bins = binner.fit_transform(df[['Price']])
    return price_bins, binner

def reverse_discretize_transform(pred, discretizer):
    """Convert (possibly continuous) bin predictions back to price values.

    A regressor trained on ordinal bin ids returns floats that may fall
    between bins or outside the valid range. Each prediction is therefore
    rounded to the nearest bin id and clipped to ``[0, n_bins - 1]`` before
    being handed to ``discretizer.inverse_transform``. Valid integer bin ids
    pass through unchanged.

    Parameters
    ----------
    pred : array-like
        One predicted bin id per row.
    discretizer : object
        Fitted single-feature discretizer exposing ``n_bins_`` and
        ``inverse_transform``.

    Returns
    -------
    Whatever ``discretizer.inverse_transform`` returns for a single-column
    2-D input.
    """
    bin_ids = np.rint(np.asarray(pred, dtype=float)).reshape(-1, 1)
    # Only one feature (Price) was discretized, so the first entry applies.
    last_bin = int(np.ravel(discretizer.n_bins_)[0]) - 1
    return discretizer.inverse_transform(np.clip(bin_ids, 0, last_bin))

In [None]:
# Encode the training split.
trainT = transform(train)
#trainT = trainT.loc[(trainT['Price'] <= 149)&(trainT['Price'] >= 16)] #remove prices larger than 150
# try1, without large values lgb and linear regression just predict mean

# Encode the test split, then align it to the training feature columns:
# one-hot encoding each split separately can produce different dummy columns,
# which would break `model.predict(testT)`. Columns missing from test are
# added as all-zero; test-only columns are dropped.
testT = transform(test).reindex(columns=trainT.columns.drop('Price'), fill_value=0)

# Discretized target. NOTE: despite its name, `quantile` holds the fitted
# KBinsDiscretizer returned by `discretize_transform`.
price_bins, quantile = discretize_transform(trainT)
trainT['TPrice'] = price_bins.ravel()  # store as a plain 1-D column
target = 'TPrice'

In [None]:
# Distribution of the discretized target.
trainT['TPrice'].hist(bins=100)

# Baseline: mean prediction

In [None]:
# Baseline submission: the training-set mean price for every test row.
# Built as a separate frame so `test` itself never gains a `Price` column,
# which would otherwise leak into any later `transform(test)` re-run.
mean_submission = test[['id']].assign(Price=train['Price'].mean())
mean_submission.to_csv('mean.csv', index=False)

## Data analysis
- Nothing obvious stands out.

In [None]:
# Pairwise correlations of the encoded training frame, then every column's
# correlation with Price, largest first.
corrMat = trainT.corr()
price_corr = corrMat.loc[:, 'Price']
price_corr.sort_values(ascending=False)

In [None]:
# Show the dtype of every column in the raw training frame.
train.dtypes

In [None]:
# Scatter every non-object, non-category column of `train` against Price,
# one figure per column.
non_categorical_features = train.select_dtypes(exclude=['object', 'category']).columns

for feature in non_categorical_features:
    fig, ax = plt.subplots(figsize=(8, 6))
    # Very low alpha so overlapping points build up visible density.
    ax.scatter(train[feature], train['Price'], alpha=0.005)
    ax.set_title(f'Scatter Plot of {feature} vs Price')
    ax.set_xlabel(feature)
    ax.set_ylabel('Price')
    plt.show()

In [None]:
import seaborn as sns

In [None]:
# Price distribution per level of each object/category column, one violin
# plot per column.
categorical_features = train.select_dtypes(include=['object', 'category']).columns

for feature in categorical_features:
    fig, ax = plt.subplots(figsize=(8, 6))
    # NOTE(review): `split=True` is passed without a `hue`; confirm it has the
    # intended effect in the installed seaborn version.
    sns.violinplot(x=feature, y='Price', data=train, split=True, ax=ax)
    ax.set_title(f'Violin Plot of {feature} vs Price')
    ax.set_xlabel(feature)
    ax.set_ylabel('Price')
    plt.show()

## target transformation
- quantile transformation

In [None]:
# Zoom in on the low end of the price range (Price < 16).
low_price_mask = train['Price'] < 16
train.loc[low_price_mask, 'Price'].hist(bins=100)

In [None]:
# Histogram of sqrt(Price).
sqrt_price = np.sqrt(train['Price'])
sqrt_price.hist(bins=100)
#log, sqrt, .. nothing really nice

In [None]:
from sklearn.preprocessing import QuantileTransformer

# Explore a quantile -> normal mapping of Price.
# A dedicated name is used instead of `quantile` so the transformer fitted
# earlier for `TPrice` is not overwritten, and the result is kept in its own
# Series so `train` does not gain an extra, target-derived column.
price_qt = QuantileTransformer(output_distribution='normal')
price_quantile = pd.Series(
    price_qt.fit_transform(train[['Price']]).ravel(),
    index=train.index,
    name='price_quantile',
)

price_quantile.hist(bins=100)

In [None]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Build features/target once. `TPrice` is derived from `Price`, so leaving it
# among the features leaks the target (and the column does not exist in the
# test frame); `errors='ignore'` keeps the cell runnable if it is absent.
X_all = trainT.drop(columns=['Price', 'TPrice'], errors='ignore')
y_all = trainT['Price']

rmse_scores = []
models = {}
for i, (train_index, test_index) in enumerate(kf.split(X_all)):
    X_train, X_test = X_all.iloc[train_index], X_all.iloc[test_index]
    y_train, y_test = y_all.iloc[train_index], y_all.iloc[test_index]

    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    models[f'model_{i}'] = model  # keep every fold model for later ensembling

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores.append(rmse)

print("RMSE scores for each fold:", rmse_scores)
print("Average RMSE across all folds:", np.mean(rmse_scores))

In [None]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Exclude both the raw target and its discretized copy `TPrice` from the
# features: `TPrice` is computed from `Price`, so keeping it leaks the answer.
X_all = trainT.drop(columns=['Price', 'TPrice'], errors='ignore')
y_all = trainT['Price']

rmse_scores = []
models = {}
for i, (train_index, test_index) in enumerate(kf.split(X_all)):
    X_train, X_test = X_all.iloc[train_index], X_all.iloc[test_index]
    y_train, y_test = y_all.iloc[train_index], y_all.iloc[test_index]

    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    models[f'model_{i}'] = model

    # Out-of-fold RMSE on the original price scale.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores.append(rmse)

print("RMSE scores for each fold:", rmse_scores)
print("Average RMSE across all folds:", np.mean(rmse_scores))

# linear regression

In [None]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Feature matrix without the target or anything derived from it. `TPrice` is
# a binned copy of `Price`; training on it would leak the target and the
# resulting models could not score the test frame, which has no such column.
X_all = trainT.drop(columns=['Price', 'TPrice'], errors='ignore')
y_all = trainT['Price']

rmse_scores = []
models = {}
for i, (train_index, test_index) in enumerate(kf.split(X_all)):
    X_train, X_test = X_all.iloc[train_index], X_all.iloc[test_index]
    y_train, y_test = y_all.iloc[train_index], y_all.iloc[test_index]

    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    models[f'model_{i}'] = model  # one fitted model per fold

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores.append(rmse)

print("RMSE scores for each fold:", rmse_scores)
print("Average RMSE across all folds:", np.mean(rmse_scores))

In [None]:
# Score the test frame with every fold model: one column per model.
test_predictions = {
    fold_name: fold_model.predict(testT)
    for fold_name, fold_model in models.items()
}

test_predictions_df = pd.DataFrame(test_predictions)

In [None]:
# Average the fold models row-wise. Restricting to the model columns keeps
# the cell idempotent: on a re-run the existing `mean` column is not folded
# into the new average.
test_predictions_df['mean'] = test_predictions_df[list(models)].mean(axis=1)

# Write `id` + `Price`, matching the other submission files in this notebook;
# a lone `mean` column cannot be matched back to test rows.
submission = pd.DataFrame({
    'id': test['id'].to_numpy(),
    'Price': test_predictions_df['mean'].to_numpy(),
})
submission.to_csv('test_predictions.csv', index=False)

In [None]:
# Coefficients of whichever estimator `model` is currently bound to.
coefficients = model.coef_

# Pair each coefficient with its feature name.
importance_df = pd.DataFrame({'feature': X_train.columns, 'coefficient': coefficients})

# Rank by absolute coefficient value so large negative effects are not pushed
# to the bottom (sorting on the signed value did not match the stated intent).
importance_df = importance_df.sort_values(
    by='coefficient', key=lambda col: col.abs(), ascending=False
)

# Display all features, most influential first.
importance_df

# linear regression quantile transformation

In [None]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Build the feature matrix and both targets once instead of re-dropping
# columns from `trainT` in every fold.
X_all = trainT.drop(['Price', 'TPrice'], axis=1)
y_all = trainT['TPrice']     # transformed target the model is fitted on
price_all = trainT['Price']  # original-scale target, used for reporting

rmse_scores = []
rmse_scores_original = []
models = {}
results = []
for i, (train_index, test_index) in enumerate(kf.split(X_all)):
    X_train, X_test = X_all.iloc[train_index], X_all.iloc[test_index]
    y_train, y_test = y_all.iloc[train_index], y_all.iloc[test_index]

    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    models[f'model_{i}'] = model

    # Reuse `y_pred` rather than predicting a second time.
    # NOTE(review): `quantile` must be the transformer that produced `TPrice`
    # for this back-transform to be meaningful -- confirm before trusting it.
    y_predO = reverse_quantile_transform(y_pred, quantile).flatten()
    y_testO = price_all.iloc[test_index]

    # RMSE on the transformed scale and on the original price scale.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores.append(rmse)
    rmseO = np.sqrt(mean_squared_error(y_testO, y_predO))
    rmse_scores_original.append(rmseO)

    # Log results
    res = pd.DataFrame({'y_testT': y_test, 'y_predT': y_pred, 'y_testO': y_testO, 'y_predO': y_predO})
    results.append(res)

results = pd.concat(results, ignore_index=True)
print("RMSE scores for each fold:", rmse_scores)
print("Average RMSE across all folds:", np.mean(rmse_scores))
print("RMSE scores for each fold original:", rmse_scores_original)
print("Average RMSE across all folds original:", np.mean(rmse_scores_original))

In [None]:
# Side-by-side histograms of true vs predicted values on the transformed scale.
transformed_cols = ['y_testT', 'y_predT']
results[transformed_cols].hist(bins=100)

In [None]:
# Overlay predicted and true transformed targets on one set of axes.
fig, ax = plt.subplots()
for column, label in (('y_predT', 'Prediction'), ('y_testT', 'Ground Truth')):
    ax.hist(results[column], bins=50, alpha=0.5, label=label)

ax.set_title('Histograms of Predictions and Ground Truths')
ax.set_xlabel('Value')
ax.set_ylabel('Frequency')
ax.legend()

plt.show()

In [None]:
# Score the test frame with every fold model and map the predictions back to
# the original scale. Each model predicts once; previously `predict` was
# called twice per model and the first result was discarded.
test_predictions = {}
for model_name, model in models.items():
    y_pred = model.predict(testT)
    y_predo = reverse_quantile_transform(y_pred, quantile).flatten()
    test_predictions[model_name] = y_predo

test_predictions_df = pd.DataFrame(test_predictions)

# Row-wise average across the fold models, then inspect its distribution.
test_predictions_df['mean'] = test_predictions_df.mean(axis=1)
test_predictions_df[['mean']].hist(bins=100)

In [None]:
# Attach the submission columns (ensemble mean as Price, ids from `test`)
# and write them out.
test_predictions_df = test_predictions_df.assign(
    Price=test_predictions_df['mean'],
    id=test['id'],
)
submission_cols = ['id', 'Price']
test_predictions_df[submission_cols].to_csv('test_predictions.csv', index=False)

# lgb regressor

In [None]:
import lightgbm as lgb

In [None]:
# Define the splitter here so the cell does not depend on an earlier one.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# `TPrice` is derived from `Price`; drop it too so the target does not leak
# into the features.
X_all = trainT.drop(columns=['Price', 'TPrice'], errors='ignore')
y_all = trainT['Price']

# Start from an empty list: otherwise scores left over from a previous section
# would be mixed into this section's per-fold output and average.
rmse_scores = []
models = {}

# Train and store each model
for i, (train_index, test_index) in enumerate(kf.split(X_all)):
    X_train, X_test = X_all.iloc[train_index], X_all.iloc[test_index]
    y_train, y_test = y_all.iloc[train_index], y_all.iloc[test_index]

    model = lgb.LGBMRegressor(objective='regression_l2', random_state=42)
    model.fit(X_train, y_train)
    models[f'model_{i}'] = model
    y_pred = model.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores.append(rmse)
print("RMSE scores for each fold:", rmse_scores)
print("Average RMSE across all folds:", np.mean(rmse_scores))

In [None]:
# One column of test predictions per fold model, plus their row-wise mean.
test_predictions = {
    fold_name: fold_model.predict(testT)
    for fold_name, fold_model in models.items()
}

test_predictions_df = pd.DataFrame(test_predictions)
ensemble_mean = test_predictions_df.mean(axis=1)
test_predictions_df['mean'] = ensemble_mean

In [None]:
# Distribution of the ensemble-mean test predictions.
test_predictions_df.loc[:, 'mean'].hist(bins=100)

# lgb regressor with transformed target

In [None]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Build the feature matrix and both targets once instead of re-dropping
# columns from `trainT` in every fold.
X_all = trainT.drop(['Price', 'TPrice'], axis=1)
y_all = trainT['TPrice']     # transformed target the model is fitted on
price_all = trainT['Price']  # original-scale target, used for reporting

rmse_scores = []
rmse_scores_original = []
models = {}
results = []
for i, (train_index, test_index) in enumerate(kf.split(X_all)):
    X_train, X_test = X_all.iloc[train_index], X_all.iloc[test_index]
    y_train, y_test = y_all.iloc[train_index], y_all.iloc[test_index]

    model = lgb.LGBMRegressor(objective='regression_l2', random_state=42)
    model.fit(X_train, y_train)
    models[f'model_{i}'] = model  # stored once (the duplicate store was removed)
    y_pred = model.predict(X_test)

    # Reuse `y_pred` rather than predicting a second time.
    # NOTE(review): `quantile` must be the transformer that produced `TPrice`
    # for this back-transform to be meaningful -- confirm before trusting it.
    y_predO = reverse_quantile_transform(y_pred, quantile).flatten()
    y_testO = price_all.iloc[test_index]

    # RMSE on the transformed scale and on the original price scale.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores.append(rmse)
    rmseO = np.sqrt(mean_squared_error(y_testO, y_predO))
    rmse_scores_original.append(rmseO)

    # Log results
    res = pd.DataFrame({'y_testT': y_test, 'y_predT': y_pred, 'y_testO': y_testO, 'y_predO': y_predO})
    results.append(res)

results = pd.concat(results, ignore_index=True)
print("RMSE scores for each fold:", rmse_scores)
print("Average RMSE across all folds:", np.mean(rmse_scores))
print("RMSE scores for each fold original:", rmse_scores_original)
print("Average RMSE across all folds original:", np.mean(rmse_scores_original))

In [None]:
# True vs predicted values on the transformed scale.
transformed_cols = ['y_testT', 'y_predT']
results[transformed_cols].hist(bins=100)

In [None]:
# True vs predicted values on the original price scale.
original_cols = ['y_testO', 'y_predO']
results[original_cols].hist(bins=100)

# predict price of 150 / using discretized output

In [None]:
#trainT['target'] = trainT['Price']> 149
# Number of rows in each bin of the discretized target.
target = 'TPrice'
trainT.loc[:, target].value_counts()


In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; features exclude the raw and discretized target.
features = trainT.drop(['Price', 'TPrice'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    features, trainT['TPrice'], test_size=0.2, random_state=42
)

# Multiclass LightGBM over the price bins present in the training labels.
n_classes = len(np.unique(y_train))
params = dict(
    objective='multiclass',
    num_class=n_classes,
    metric='multi_logloss',
    boosting_type='gbdt',
    num_leaves=31,
    learning_rate=0.05,
)
train_data = lgb.Dataset(X_train, label=y_train)
model = lgb.train(params, train_data, num_boost_round=100)

# Take the highest-scoring class for every held-out row.
y_pred_proba = model.predict(X_test)
y_pred = y_pred_proba.argmax(axis=1)

# Share of held-out rows whose predicted bin equals the true bin.
accuracy = (y_pred == y_test).mean()
print('Accuracy:', accuracy)