## Run in Google Colab
<a href="https://colab.research.google.com/github/lmanov1/HeavyMachineryAuction/blob/main/HeavyMachineryAuction.ipynb" target="_blank">
<img src="https://colab.research.google.com/assets/colab-badge.svg"
 title="Open this file in Google Colab" alt="Colab"/>
</a>

In [None]:
! pip install gdown
! pip install pandas
! pip install seaborn
! pip install numpy
! pip install matplotlib
! pip install scikit-learn
! pip install pathlib


import gdown
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from pathlib import Path

def download_from_gdrive(url, filename):
    """Download a shared Google Drive file unless a local copy already exists.

    Args:
        url: Google Drive link carrying the file ID either as a path segment
            (``.../d/<id>/...``) or as an ``id=<id>`` query parameter.
        filename: Local path the file is saved to.

    Raises:
        ValueError: If a download is needed and no file ID can be found in
            ``url``.
    """
    # Check for an existing copy first, so nothing is parsed or fetched when
    # there is nothing to download.
    if Path(filename).exists():
        print(f"File '{filename}' already exists. Skipping download.")
        return

    # Extract the file ID from the URL
    if '/d/' in url:
        file_id = url.split('/d/', 1)[1].split('/', 1)[0].split('?', 1)[0]
    elif 'id=' in url:
        file_id = url.split('id=', 1)[1].split('&', 1)[0]
    else:
        file_id = ''
    if not file_id:
        raise ValueError(f"Could not find a Google Drive file ID in URL: {url!r}")

    download_url = f"https://drive.google.com/uc?id={file_id}"

    # Download the file
    # NOTE(review): a None result is treated as a failed download; confirm
    # this matches the behaviour of the installed gdown version.
    saved_path = gdown.download(download_url, filename, quiet=False)
    if saved_path is None:
        print(f"Download failed for: {filename}")
    else:
        print(f"File downloaded as: {filename}")

# Google Drive share links for the two dataset files
train = 'https://drive.google.com/file/d/1guqSpDv1Q7ZZjSbXMYGbrTvGns0VCyU5/view?usp=drive_link'
valid = 'https://drive.google.com/file/d/1j7x8xhMimKbvW62D-XeDfuRyj9ia636q/view?usp=drive_link'

# Fetch each file into the working directory (skipped when already present)
for source_url, local_name in ((train, 'train.csv'), (valid, 'valid.csv')):
    download_from_gdrive(source_url, local_name)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor

# Load the data
data = pd.read_csv('train.csv')

# Cast each of these columns to str *from itself* so every value in the
# column has one consistent type (presumably they load as mixed-type
# columns - confirm).
# NOTE(review): depending on the pandas version, missing values may become
# the literal string 'nan' here and then bypass the categorical imputer.
for col in ('fiModelDescriptor', 'fiModelSeries', 'Grouser_Tracks', 'Hydraulics_Flow'):
    data[col] = data[col].astype('str')

# Split into features and target
X = data.drop('SalePrice', axis=1)  # Assuming 'SalePrice' is the target
y = data['SalePrice']

# Work on a subsample of at most 20000 rows; the seed makes the sample
# (and therefore the reported score) reproducible between runs.
X_small = X.sample(min(20000, len(X)), random_state=42)
y_small = y.loc[X_small.index]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_small, y_small, test_size=0.2, random_state=42)
#X_train.info()

# Identify numerical and categorical features from the training split
numerical_features = X_train.select_dtypes(include=['float', 'int','bool']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

print("--------------------------------------")
# Numerical columns: fill gaps with the column mean, then standardise
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fitting are ignored at prediction time
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine transformers using ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create a pipeline for model training
# (the final step keeps its original name 'classifier' although it is a regressor)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestRegressor(random_state=42))  # Replace with your desired model
])

pipeline.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
# Report the score; previously it was computed and silently discarded
print(f"Test RMSE: {rmse:.2f}")
  

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

def manipulate_data(df):
    """Return a cleaned, feature-engineered copy of an auction dataframe.

    The input frame is left unmodified. The returned frame:
      * gains ``SaleYear``, ``SaleMonth`` and ``Age`` derived from ``saledate``;
      * has ``YearMade`` replaced by the sale year wherever it is <= 1900 or
        later than the sale year;
      * no longer contains ``saledate`` or ``MachineID``;
      * is indexed by ``SalesID``;
      * has missing values in numeric columns filled with the column mean.
    """
    cleaned = df.copy()

    # Parse the sale date into a temporary column and derive calendar features
    cleaned['Saledate'] = pd.to_datetime(df.saledate)
    sale_dates = cleaned['Saledate']
    cleaned['SaleYear'] = sale_dates.dt.year
    cleaned['SaleMonth'] = sale_dates.dt.month

    # A manufacture year that is implausibly old or lies after the sale is
    # replaced by the sale year, which makes the resulting age zero
    too_old = cleaned['YearMade'] <= 1900
    after_sale = cleaned['YearMade'] > cleaned['SaleYear']
    cleaned.loc[too_old | after_sale, 'YearMade'] = cleaned['SaleYear']
    cleaned['Age'] = cleaned['SaleYear'] - cleaned['YearMade']

    # The raw and parsed sale dates are redundant now; MachineID is dropped
    # as an identifier. ModelID is intentionally kept.
    cleaned = cleaned.drop(columns=['saledate', 'Saledate', 'MachineID'])

    # Use the sale identifier as the row index
    cleaned = cleaned.set_index('SalesID')

    # Mean-impute every numeric column
    numeric_cols = cleaned.select_dtypes(include=[np.number, np.datetime64]).columns
    for name in numeric_cols:
        series = cleaned[name]
        cleaned[name] = series.fillna(series.mean())

    return cleaned


### MAIN #########

# Load the raw training data and report its size
df = pd.read_csv('train.csv')
print ("Initial ", df.shape)
print("-----------------------------------------------------")

# Clean data and engineer features
df2 = manipulate_data(df)
print("-----------------------------------------------------")
print ("After manipulate_data ", df2.shape )
print("-----------------------------------------------------")

# Remove rows that are exact duplicates of an earlier row
df2 = df2.drop_duplicates()
print("After drop_duplicates ", df2.shape)
print("-----------------------------------------------------")

print(df2.head())

# Prepare data for training: target column vs. everything else
y = df2['SalePrice']
X = df2.drop(columns='SalePrice')


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Shared settings for the random-forest experiment
RANDOM_STATE = 42  # seed passed to the model and to train_test_split
MAX_DEPTH = 20     # max_depth given to the random forest
ESTIMATORS = 170   # presumably the intended number of trees - confirm
TEST_SIZE = 0.2    # fraction of the sample held out as the test split

###################################################
def RMSE(y_pred, y_true):
    """Return the root-mean-square error between predictions and targets.

    Both arguments must support element-wise subtraction and the result must
    offer ``.mean()`` (e.g. NumPy arrays or pandas Series).
    """
    residuals = y_pred - y_true
    mean_squared = (residuals ** 2).mean()
    return mean_squared ** 0.5


#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Train on a subsample of at most 20000 rows. Seeding the sample makes the
# reported scores reproducible between runs.
X_small = X.sample(min(20000, len(X)), random_state=RANDOM_STATE)
y_small = y.loc[X_small.index]

# ESTIMATORS is passed explicitly; it was previously defined but never used,
# so the forest silently fell back to the library default size.
model = RandomForestRegressor(
    n_estimators=ESTIMATORS,
    max_depth=MAX_DEPTH,
    random_state=RANDOM_STATE,
)

X_train, X_test, y_train, y_test = train_test_split(X_small, y_small, test_size=TEST_SIZE, random_state=RANDOM_STATE)

# Drop columns that carry no information in the training split
# (at most one distinct non-missing value)
constant_cols = [col for col in X_train.columns if X_train[col].nunique() <= 1]
for col in constant_cols:
    print("Dropping ", col)
X_train = X_train.drop(columns=constant_cols)

# Descriptive text columns excluded from the model. errors='ignore' keeps
# this working when one of them was already removed as a constant column.
to_drop = ['fiModelDesc','fiBaseModel','fiSecondaryDesc','fiModelSeries','fiModelDescriptor','fiProductClassDesc','ProductGroupDesc','state']
X_train = X_train.drop(columns=to_drop, errors='ignore')

# The remaining object columns are the ones to one-hot encode
cols_For_onehot_encoding = X_train.select_dtypes(include=["object"]).columns
print("cols_For_onehot_encoding shortened" ,cols_For_onehot_encoding, len(cols_For_onehot_encoding))

print("Before=====",  X_train.shape ,X_test.shape )

# Perform one-hot encoding on the train dataset
X_train = pd.get_dummies(X_train, columns=cols_For_onehot_encoding , drop_first=True)

# Encode the test split with ALL category levels and let reindex select the
# training columns. Using drop_first here would drop the test split's own
# first level, which can differ from the level dropped in training and would
# silently encode those rows as the baseline category.
X_test = pd.get_dummies(X_test, columns=cols_For_onehot_encoding, drop_first=False)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

print("After=====",  X_train.shape ,X_test.shape )

X_train.info()
# Keep the encoded matrices on disk for inspection
X_test.to_csv('X_test.csv')
X_train.to_csv('X_train.csv')

model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

test_rmse = RMSE(y_test_pred, y_test)
train_rmse = RMSE(y_train_pred, y_train)

print(f" Best test RMSE: {test_rmse:.2f} | y_test.std()={y_test.std():.2f} | y_test.mean()={y_test.mean():.2f} ")
print(f" Best train RMSE: {train_rmse:.2f} | y_train.std()={y_train.std():.2f} | y_train.mean()={y_train.mean():.2f}")

# from sklearn.inspection import permutation_importance
# result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=RANDOM_STATE, n_jobs=-1) 
# feature_importances = pd.DataFrame(result.importances_mean, index=X_test.columns, columns=['Permutation Importance'])
# feature_importances['Permutation Importance'] = feature_importances['Permutation Importance'] / feature_importances['Permutation Importance'].sum()
# feature_importances['Feature importance'] = model.feature_importances_
# feature_importances = feature_importances.sort_values(by='Permutation Importance', ascending=False)
# print(feature_importances)


In [None]:
# Predict on the validation set
X_valid = pd.read_csv('valid.csv')
print ("Initial ", X_valid.shape)

print("-----------------------------------------------------")
X_valid = manipulate_data(X_valid)
print("-----------------------------------------------------")
print ("After manipulate_data ", X_valid.shape )
print("-----------------------------------------------------")

# Align the validation dataset with the train dataset columns.
# Every category level is encoded here (no drop_first); reindex then keeps
# exactly the training columns. Dropping the validation set's own first level
# could remove a level the model was trained on and mis-encode those rows as
# the baseline category.
X_valid = pd.get_dummies(X_valid, columns=cols_For_onehot_encoding, drop_first=False)
X_valid = X_valid.reindex(columns=X_train.columns, fill_value=0)

print ("After one hot encoding and reindex ", X_valid.shape )
print("-----------------------------------------------------")

# Predict and label the result by the validation frame's index
y_valid_pred = pd.Series(model.predict(X_valid), index=X_valid.index, name='SalePrice')
y_valid_pred.info()



In [None]:
from datetime import datetime

# Build the timestamp once and avoid ':' characters, which isoformat() emits
# and which are not allowed in file names on Windows.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
submission_path = f'submission_{timestamp}.csv'

# Write the predictions together with their index
y_valid_pred.to_csv(submission_path)
print(f"Submission written to: {submission_path}")

In [None]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# #print(df2.corr())
# # Create a figure with a custom size
# plt.figure(figsize=(12, 10))  # Adjust figsize to fit your needs
# sns.heatmap(X_valid.corr(), annot=True, fmt='0.2f')
# plt.show()

In [None]:
# #Predict and analyse the results
# #Compute permutation feature importance
# import matplotlib.pyplot as plt
# import numpy as np
# import seaborn as sns
# from sklearn.inspection import permutation_importance

# ###################################################
# def important_features_analysis(model, X_test, y_test, n_repeats=10, random_state=RANDOM_STATE, n_jobs=-1):
#     result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=RANDOM_STATE, n_jobs=-1)    

#     # Summarize feature importance    
#     feature_importances = pd.DataFrame(result.importances_mean, index=X_test.columns, columns=['Permutation Importance'])
#     feature_importances['Permutation Importance'] = feature_importances['Permutation Importance'] / feature_importances['Permutation Importance'].sum()
#     feature_importances['Feature importance'] = model.feature_importances_
#     feature_importances = feature_importances.sort_values(by='Permutation Importance', ascending=False)
#     print(feature_importances)

# ###################################################
# def analyse_prediction(model , X_data, y_data , y_pred, random_state=RANDOM_STATE):
    
#     print("\n\nRMSE:", round(RMSE(y_pred, y_data),2))
#     print("STD", round(y_data.std(),2))
#     important_features_analysis(model, X_data, y_data)
   


# def plot_prediction(y_data, y_pred, title='Actual vs Predicted Prices'):
#     plt.figure(figsize=(10, 6))
#     sns.scatterplot(x=y_data, y=y_pred)
#     xx = np.linspace(y_data.min(), y_data.max(), 100)
#     plt.plot(xx, xx, 'r--')
#     plt.title(title)
#     plt.xlabel('actual')
#     plt.ylabel('predicted')
#     plt.show()

# ###################################################
# def predict(model,X_data , y_data=None, random_state=RANDOM_STATE):
#     y_pred = model.predict(X_data)
#     if y_data is None:
#         return y_pred
    
#     analyse_prediction(model, X_data , y_data , y_pred, random_state=random_state)    
#     return y_pred
# ###################################################
# def summary(model,X_train, X_test, y_train, y_test, random_state=RANDOM_STATE):
    
#     y_train_pred = predict(model,X_train, y_train,random_state=random_state)
#     y_test_pred = predict(model,X_test, y_test,random_state=random_state)   
    
#     plot_prediction(y_train, y_train_pred,"Actual vs Predicted Prices (Train)")
     
#     plot_prediction(y_test, y_test_pred,"Actual vs Predicted Prices (Test)")
   
#     return y_train_pred, y_test_pred

# ###################################################
# def summary2(model,X_train, X_test, y_train, y_test, random_state=RANDOM_STATE):
    
#     y_train_pred, y_test_pred = summary(model, X_train, X_test, y_train, y_test, random_state=random_state)
#     return y_train_pred, y_test_pred
    
# ###################################################
# def plot_residuals(y_true, y_pred, title):
#     residuals = y_true - y_pred
#     plt.scatter(y_pred, residuals, alpha=0.5)
#     plt.title(title)
#     plt.xlabel('Predicted Values')
#     plt.ylabel('Residuals')
#     plt.axhline(y=0, color='r', linestyle='--')
#     plt.show()

# ###############################################################
# # Assuming the summary function is defined as in the provided code snippet
# def summary_with_residuals(model, X_train, X_test, y_train, y_test, random_state):
#     y_train_pred, y_test_pred = summary(model, X_train, X_test, y_train, y_test, random_state=random_state)
    
#     # Plotting residuals for training data
#     plot_residuals(y_train, y_train_pred, "Residuals for Training Data")
    
#     # Plotting residuals for test data
#     plot_residuals(y_test, y_test_pred, "Residuals for Test Data")


# # ---------------------------------------------------------------------------------
# # Call the modified summary function with residuals plotting
# summary_with_residuals(model, X_train, X_test, y_train, y_test, random_state=RANDOM_STATE)