In [11]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import seaborn as sns
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,PolynomialFeatures,MinMaxScaler
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report, f1_score,plot_confusion_matrix
from sklearn.linear_model import LinearRegression,Ridge,ElasticNet,Lasso,Perceptron
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.svm import SVC,LinearSVC
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedKFold,cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

In [12]:
# Load the raw training frame from the pickle shipped with the assignment.
# NOTE(review): unpickling executes arbitrary code; only load files from a trusted source.
Tr_data = pd.read_pickle('train.pkl')

In [13]:
# Cleaning the column names: build the full old -> new mapping first,
# then apply it with a single rename instead of one rename per column.
columns = Tr_data.columns
cleaned_names = {}
for col in columns:
    # strip out the first opening parenthesis and everything after it
    stem = re.sub(r'\(.*', '', col)
    # trim whitespace, use underscores as separators, lowercase the column name
    cleaned_names[col] = stem.strip().replace(' ', '_').replace('-', '_').lower()
Tr_data = Tr_data.rename(columns=cleaned_names)

# Work on a copy so the renamed raw frame stays available unchanged.
df = Tr_data.copy()

In [14]:
# Remove every row that holds the placeholder string 'missing' in any column,
# so the remaining data can be cast to numbers and visualised.
# `axis` is passed by keyword: the positional form `any(1)` is rejected by pandas 2.x.
has_missing = df.eq('missing').any(axis=1)
missing_values = df[has_missing]
# Filter with the boolean mask instead of `df.index[<labels>]`: the old form treated
# index labels as positions, which is only correct for a default RangeIndex.
# `.copy()` gives an independent frame so later column assignments do not warn.
df = df.loc[~has_missing].copy()

In [15]:
# Columns that are cast to float and fed into the correlation matrix
# (this includes the target, `rental_bikes_count`).
cat_columns = [
    'year', 'month', 'hour', 'holiday', 'weekday', 'working_day',
    'temperature', 'feels_like_temperature',
    'humidity', 'windspeed', 'rental_bikes_count',
]
# One astype call with a per-column dtype mapping replaces the column-by-column loop.
df = df.astype({name: float for name in cat_columns})

# Pairwise correlation between the listed columns.
corr = df[cat_columns].corr()
#plt.figure(figsize= (20,10))
#sns.heatmap(corr,annot = True, cmap = 'BrBG')
#plt.show()

In [16]:
#plt.figure(figsize = (20,10))
#sns.scatterplot(x = df['hour'],y = df['rental_bikes_count'],hue = df['season'])
#plt.show()
#df.dtypes


In [17]:
# Encode the weather description as an integer code; the two precipitation
# categories share code 0.
df = df.replace({
    'Heavy rain, heavy snow or thunderstorm': 0,
    'Light snow or light rain': 0,
    'Misty and/or cloudy': 1,
    'Clear or partly cloudy': 2,
})
# Encode the season names as integer codes.
df = df.replace({'Summer': 0, 'Winter': 1, 'Fall': 2, 'Spring': 3})

#df = df.drop(['month','weekday','weather_situation','season','holiday'], axis= 1)


In [18]:
# Target is the rental count; every other column is a feature.
y = df['rental_bikes_count'].copy()
X = df.drop(columns='rental_bikes_count').copy()

# Hold out 40% of the rows for evaluation; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=100
)

In [19]:
# PIPELINES
DT_regressor = make_pipeline(DecisionTreeRegressor(max_depth=10))
RF_regressor = make_pipeline(RandomForestRegressor(n_estimators=100, random_state=10,n_jobs=-1))


In [20]:
# Fit both regressors on the training split.
DT_regressor.fit(X_train, y_train)
RF_regressor.fit(X_train, y_train)

# NOTE: for regressors `.score` reports the coefficient of determination (R^2),
# so the "accuracy" wording in the printed labels is a misnomer.
dt_train_score = DT_regressor.score(X_train, y_train)
dt_test_score = DT_regressor.score(X_test, y_test)
rf_train_score = RF_regressor.score(X_train, y_train)
rf_test_score = RF_regressor.score(X_test, y_test)

print('Train accuracy DT: {0:.5f}'.format(dt_train_score))
print('Test accuracy DT: {0:.5f}'.format(dt_test_score))

print('Train accuracy RF: {0:.5f}'.format(rf_train_score))
print('Test accuracy RF: {0:.5f}'.format(rf_test_score))



Train accuracy DT: 0.93511
Test accuracy DT: 0.88117
Train accuracy RF: 0.99134
Test accuracy RF: 0.93779


In [21]:
# Turn the regression target into a 5-class label: pd.cut splits the observed
# range of rental counts into five equal-width bins labelled 0..4.
dfc = df.copy()
dfc['rental_bikes_count'] = pd.cut(
    dfc['rental_bikes_count'],
    bins=5,
    labels=list(range(5)),
)

In [22]:
# Features and binned target for the classification variant of the task.
yc = dfc['rental_bikes_count'].copy()
Xc = dfc.drop(columns='rental_bikes_count').copy()

# Same 60/40 split and seed as used for the regression data.
Xc_train, Xc_test, yc_train, yc_test = train_test_split(
    Xc, yc, test_size=0.4, random_state=100
)

In [23]:

# --- Random forest on the unscaled features ---------------------------------
forest = RandomForestClassifier(
    criterion='gini',
    n_estimators=60,
    random_state=100,
    max_depth=20,
    n_jobs=-1,
)
forest.fit(Xc_train, yc_train)
FTrain = forest.score(Xc_train, yc_train)
FTest = forest.score(Xc_test, yc_test)

# --- RBF SVM on standardised features ---------------------------------------
# The scaler is fitted on the training split only and then applied to both splits.
sc = StandardScaler().fit(Xc_train)
Xc_train_sc = sc.transform(Xc_train)
Xc_test_sc = sc.transform(Xc_test)

svm = SVC(kernel='rbf', C=20, random_state=100)
svm.fit(Xc_train_sc, yc_train)

svm_train_acc = svm.score(Xc_train_sc, yc_train)
svm_test_acc = svm.score(Xc_test_sc, yc_test)

print('Train accuracy svm: {0:.5f}'.format(svm_train_acc))
print('Test accuracy svm: {0:.5f}'.format(svm_test_acc))
print()

# Reuse the forest scores computed above rather than scoring a second time.
print('Train accuracy forest : {0:.5f}'.format(FTrain))
print('Test accuracy forest : {0:.5f}'.format(FTest))


Train accuracy svm: 0.83274
Test accuracy svm: 0.75808

Train accuracy forest : 1.00000
Test accuracy forest : 0.81845


In [24]:
# Load the unlabelled test frame.
# NOTE(review): unpickling executes arbitrary code; only load files from a trusted source.
test_data = pd.read_pickle('test.pkl')

# Apply the same column-name cleaning as for the training data, collecting the
# old -> new mapping first and renaming once.
columns = test_data.columns
cleaned_names = {}
for col in columns:
    # strip out the first opening parenthesis and everything after it
    stem = re.sub(r'\(.*', '', col)
    # trim whitespace, use underscores as separators, lowercase the column name
    cleaned_names[col] = stem.strip().replace(' ', '_').replace('-', '_').lower()
test_data = test_data.rename(columns=cleaned_names)

dft = test_data.copy()



In [25]:
# Encode the weather description with the same integer codes used for the training
# frame. 'Clear or partly cloudy' must map to 2 (it was 1 here), otherwise the model
# sees a different encoding at prediction time than it was fitted on.
dft = dft.replace(to_replace=['Heavy rain, heavy snow or thunderstorm','Light snow or light rain','Misty and/or cloudy','Clear or partly cloudy'], value = [0,0,1,2])
# Encode the season names with the same integer codes as the training frame.
dft = dft.replace(to_replace=['Summer','Winter','Fall','Spring'], value = [0,1,2,3])

In [26]:

# Refit the random-forest pipeline on all labelled rows (train + hold-out)
# before predicting the unlabelled test frame.
kgl_y = df['rental_bikes_count'].copy()
kgl_X = df.drop(columns='rental_bikes_count').copy()

RF_regressor.fit(kgl_X, kgl_y)
kgl_pred = RF_regressor.predict(dft)

# Submission table: the test frame's index as 'idx' next to the predictions.
output = pd.DataFrame({
    'idx': dft.index,
    'Rental bikes count': kgl_pred,
})
output.to_csv('CA5_submission_RF', index=False)
