In [None]:
! pip install gdown
! pip install pandas
! pip install seaborn
! pip install numpy
! pip install matplotlib
! pip install scikit-learn
! pip install pathlib


import gdown
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from pathlib import Path

def download_from_gdrive(url, filename):
    """Download a shared Google Drive file to ``filename`` unless it already exists.

    Parameters
    ----------
    url : str
        Google Drive sharing link. Both the ``.../d/<id>[/...]`` form and the
        ``...?id=<id>`` form are recognised.
    filename : str or path-like
        Local destination. If a file is already there, nothing is downloaded.

    Raises
    ------
    ValueError
        If no file id can be found in ``url``.
    RuntimeError
        If ``gdown.download`` returns ``None`` (how some gdown releases report
        a failed download instead of raising).
    """
    import re  # local import: only this helper needs it

    # A cached copy makes the URL irrelevant, so check for it first.
    if Path(filename).exists():
        print(f"File '{filename}' already exists. Skipping download.")
        return

    # Locate the id by pattern rather than by slash position, so links with or
    # without a trailing "/view?..." segment (or using "?id=") all work.
    match = re.search(r"/d/([\w-]+)", url) or re.search(r"[?&]id=([\w-]+)", url)
    if match is None:
        raise ValueError(f"Could not find a Google Drive file id in URL: {url!r}")
    download_url = f"https://drive.google.com/uc?id={match.group(1)}"

    # Download the file; only report success when gdown handed back a path.
    result = gdown.download(download_url, filename, quiet=False)
    if result is None:
        raise RuntimeError(f"Download failed for {download_url}")
    print(f"File downloaded as: {filename}")

# Google Drive sharing links for the two datasets.
# They are deliberately not called `train` / `valid`: a later cell defines a
# function named `train`, and sharing that name means re-running this cell
# would replace the function with a string.
TRAIN_URL = 'https://drive.google.com/file/d/1guqSpDv1Q7ZZjSbXMYGbrTvGns0VCyU5/view?usp=drive_link'
VALID_URL = 'https://drive.google.com/file/d/1j7x8xhMimKbvW62D-XeDfuRyj9ia636q/view?usp=drive_link'

# Fetch both files (skipped when they are already on disk).
download_from_gdrive(TRAIN_URL, 'train.csv')
download_from_gdrive(VALID_URL, 'valid.csv')


# Shared configuration for the cells below.
RANDOM_STATE = 42
MAX_DEPTH = 20

# Raw training data; later cells derive their own frames from `df`.
df = pd.read_csv('train.csv')
df.head()


In [None]:

def manipulate_data(df):
    """Return a feature frame derived from ``df``; the input is left untouched.

    Steps, in order:

    1. One-hot encode ``UsageBand`` into ``UsageBand_*`` columns.
    2. Parse ``saledate`` and keep only numeric, datetime and boolean columns
       (everything else is discarded).
    3. Add ``Age`` (sale year minus ``YearMade``), ``SaleYear`` and ``SaleMonth``.
    4. Drop the parsed sale date and ``MachineID``. ``ModelID`` is kept.
    """
    features = df.copy()

    # Encode the usage band as indicator columns.
    features['UsageBand'] = features['UsageBand'].astype('category')
    features = pd.get_dummies(features, columns=['UsageBand'], prefix='UsageBand')

    # The parsed timestamp lives in a temporary column so that it survives the
    # dtype filter, which removes the original text column.
    features['Saledate'] = pd.to_datetime(df.saledate)
    features = features.select_dtypes(['number', 'datetime', 'bool'])

    # Date-derived features.
    sale_year = features['Saledate'].dt.year
    sale_month = features['Saledate'].dt.month
    features['Age'] = sale_year - features['YearMade']
    features['SaleYear'] = sale_year
    features['SaleMonth'] = sale_month

    # The timestamp is now redundant; MachineID is treated as an identifier,
    # not a feature.
    return features.drop(columns=['Saledate', 'MachineID'])

In [None]:

# All the functions below are for the purpose of training a model
# Compute permutation feature importance

from sklearn.inspection import permutation_importance

def important_features_analysis(model, X_test, y_test):
    """Print a per-feature importance table for a fitted model.

    The table has one row per column of ``X_test`` and two columns:
    the mean permutation importance (10 repeats) rescaled so the column sums
    to 1, and the model's own ``feature_importances_``. Rows are sorted by
    permutation importance, largest first. Nothing is returned.
    """
    perm = permutation_importance(
        model, X_test, y_test, n_repeats=10, random_state=RANDOM_STATE, n_jobs=-1
    )

    # Rescale the mean importances so that they sum to one.
    mean_scores = pd.Series(perm.importances_mean, index=X_test.columns)
    table = pd.DataFrame({'Permutation Importance': mean_scores / mean_scores.sum()})

    # Side-by-side comparison with the model's built-in importances.
    table['Feature importance'] = model.feature_importances_
    print(table.sort_values(by='Permutation Importance', ascending=False))

    
def train(model, X, y):
    """Split ``X``/``y`` 80/20, fit ``model`` on the training part, return the split.

    Returns the tuple ``(X_train, X_test, y_train, y_test)``. The split is
    seeded with ``RANDOM_STATE``; ``model`` is fitted in place.
    """
    parts = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
    fit_features, _, fit_target, _ = parts
    model.fit(fit_features, fit_target)
    return tuple(parts)

def RMSE(y_pred, y_true):
    """Root-mean-square error between predictions and true values.

    Both arguments must support element-wise subtraction and the result must
    expose ``.mean()`` (e.g. NumPy arrays or pandas Series).
    """
    residuals = y_pred - y_true
    mean_squared_error_value = (residuals ** 2).mean()
    return mean_squared_error_value ** 0.5

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns



def analyse_prediction(model , X_data, y_data , y_pred):
    """Print the RMSE of ``y_pred`` against ``y_data`` and the spread of ``y_data``.

    Both figures are rounded to three decimals. ``model`` and ``X_data`` are
    accepted but not used by the current body. Nothing is returned.
    """
    # Prediction error.
    rmse_value = RMSE(y_pred, y_data)
    print("\n\nRMSE:", round(rmse_value, 3))

    # Standard deviation of the true target, printed next to the RMSE for scale.
    target_spread = y_data.std()
    print("STD", round(target_spread, 3))

    # A feature-importance report and a predicted-vs-actual scatter plot were
    # sketched here previously; both are currently disabled.


def predict(model,X_data , y_data=None):
    """Return ``model.predict(X_data)``.

    When ``y_data`` is supplied, the predictions are also passed to
    ``analyse_prediction`` (which prints error metrics) before being returned.
    """
    y_pred = model.predict(X_data)

    # Only report metrics when the ground truth is known.
    if y_data is not None:
        analyse_prediction(model, X_data, y_data, y_pred)

    return y_pred

def summary(model,X_train, X_test, y_train, y_test):
    """Predict on the train split, then the test split, printing metrics for each.

    Returns ``(y_train_pred, y_test_pred)``.
    """
    splits = ((X_train, y_train), (X_test, y_test))
    # Evaluated in order: training split first, then test split.
    return tuple(predict(model, features, target) for features, target in splits)


def summary2(model,X_train, X_test, y_train, y_test):
    """Thin wrapper around ``summary``; returns its ``(y_train_pred, y_test_pred)`` pair."""
    return summary(model, X_train, X_test, y_train, y_test)


In [None]:
### MAIN #########

# Clean data
df2 = manipulate_data(df)
df2 = df2.set_index('SalesID')  # set the index to the unique identifier

# Remove outliers column by column: keep values within mean +/- 3 standard
# deviations (a band six sigma wide).
for col in df2.columns:
    # Skip boolean indicator columns. For a flag that is set in fewer than
    # 10% of the rows, mean + 3*std is below 1, so the range test would delete
    # every row in which the flag is set.
    if df2[col].dtype == bool:
        continue

    mean = df2[col].mean()
    std_dev = df2[col].std()
    lower_bound = mean - 3 * std_dev
    upper_bound = mean + 3 * std_dev

    # Comparisons against NaN are False, so missing values must be kept
    # explicitly; otherwise they are dropped before they can be filled.
    in_range = (df2[col] >= lower_bound) & (df2[col] <= upper_bound)
    df2 = df2.loc[in_range | df2[col].isna()].copy()  # copy: safe to assign below

    # Instead of dropna: impute the remaining gaps with the column mean.
    df2[col] = df2[col].fillna(mean)

df2 = df2[df2['YearMade'] >= 1900]  # remove some outliers
df2 = df2.drop_duplicates()  # drop duplicates

# Prepare data for training
X = df2.drop(columns='SalePrice')
y = df2['SalePrice']
print("Data shape:", X.shape)


# Alternative with a depth limit:
#   RandomForestRegressor(random_state=RANDOM_STATE, max_depth=MAX_DEPTH)
model = RandomForestRegressor(random_state=RANDOM_STATE)

# Train on a subsample to keep the run time down. The draw is seeded so that
# Restart & Run All reproduces the same rows, and capped at the number of rows
# available so a small cleaned frame does not raise.
SAMPLE_SIZE = 20000
X_small = X.sample(n=min(SAMPLE_SIZE, len(X)), random_state=RANDOM_STATE)
y_small = y.loc[X_small.index]
X_train, X_test, y_train, y_test = train(model, X_small, y_small)


# Summarize the fitted model (prints RMSE / STD for train and test). The
# predictions are bound to names so the cell does not echo two large arrays.
y_train_pred, y_test_pred = summary2(model, X_train, X_test, y_train, y_test)

In [None]:

# Build the validation features with the same pipeline used for training.
X_valid = pd.read_csv('valid.csv')
X_valid = X_valid.set_index('SalesID')
X_valid = manipulate_data(X_valid)

# pd.get_dummies derives the indicator columns from the values present in each
# file, so the validation frame can end up with a different set or order of
# columns than the training features `X`. Align it explicitly: columns the
# model was not trained on are dropped, columns absent here are added as 0.
missing_cols = [c for c in X.columns if c not in X_valid.columns]
extra_cols = [c for c in X_valid.columns if c not in X.columns]
if missing_cols or extra_cols:
    print("Aligning validation columns; added:", missing_cols, "dropped:", extra_cols)
X_valid = X_valid.reindex(columns=X.columns, fill_value=0)
print(X_valid.shape)

# NOTE(review): missing values are passed to the model unchanged (mean-filling
# was tried here and disabled). Whether predict() accepts NaN depends on the
# installed scikit-learn version; confirm.
y_valid_pred = predict(model, X_valid)
y_valid_pred = pd.Series(y_valid_pred, index=X_valid.index, name='SalePrice')





In [None]:
from datetime import datetime

# Build the timestamped file name once. The format avoids ':' (which
# `isoformat()` emits and which is not allowed in Windows file names).
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
submission_path = f'submission_{timestamp}.csv'

y_valid_pred.to_csv(submission_path)
submission_path  # show where the predictions were written
