In [98]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Perceptron
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

### Using Smote Oversampling

In [99]:
Training_data = pd.read_csv('train.csv', index_col=0)
df = Training_data.copy()
df = df.dropna()
df = df[df.Landsize != 0.0]
df = df.drop(['Bedrooms'], axis =1)
df = pd.get_dummies(df,columns=['Type','Method','Regionname'])
#----------------------------------------------------
# Fixing the Test Data
Test_data = pd.read_csv('test.csv', index_col=0)
Test_data = Test_data.drop(['Bedrooms'], axis =1)
Test_data = pd.get_dummies(Test_data,columns=['Type','Method','Regionname'])
#-----------------------------------------------------
sm = SMOTE(sampling_strategy='auto', random_state=7)

# Fit the model to generate the data.
oversampled_trainX, oversampled_trainY = sm.fit_resample(df.drop('Price class', axis=1), df['Price class'])
oversampled_train = pd.concat([pd.DataFrame(oversampled_trainY), pd.DataFrame(oversampled_trainX)], axis=1)
df = oversampled_train

#_---------------------------------------------------

X = df.iloc[:,1:].copy()
y = df.iloc[:,0].copy()
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=100, stratify= y)
#----------------------------------------------------

forest = RandomForestClassifier(criterion='gini',
                                        n_estimators=100, 
                                        random_state= 100,
                                        n_jobs=-1)
forest.fit(X_train, y_train)
FTrain = forest.score(X_train, y_train)
FTest  = forest.score(X_test, y_test)

#------------------------------------------------------
sc = StandardScaler()
sc.fit(X_train)

X_train_sc = sc.transform(X_train)
X_test_sc = sc.transform(X_test)

svm = SVC(kernel='rbf', C=20, random_state=100)
svm.fit(X_train_sc, y_train)

print('Train accuracy svm: {0:.5f}'.format(svm.score(X_train_sc, y_train)))
print('Test accuracy svm: {0:.5f}'.format(svm.score(X_test_sc, y_test)))
print()

print('Train accuracy forest : {0:.5f}'.format(forest.score(X_train, y_train)))
print('Test accuracy forest : {0:.5f}'.format(forest.score(X_test, y_test)))


Train accuracy svm: 0.92063
Test accuracy svm: 0.87038

Train accuracy forest : 0.99988
Test accuracy forest : 0.91227


In [70]:
forest_xtrain = X
forest_ytrain = y
forest_test = Test_data

forest = RandomForestClassifier(criterion='gini',
                                n_estimators=100,
                                random_state= 100,
                                n_jobs=-1)
forest.fit(forest_xtrain, forest_ytrain)

forest_target_predictions = forest.predict(forest_test)
output = pd.DataFrame({'index': forest_test.index,'Price class': forest_target_predictions})
output.to_csv('CA4_submission_forest',index=False)

In [71]:
svc_xtrain = X
svc_ytrain = y
svc_test = Test_data

Train_sc = sc.transform(X)
Test_sc = sc.transform(svc_test)

svm.fit(Train_sc,y)

svc_target_predictions = svm.predict(Test_sc)

output = pd.DataFrame({'index': svc_test.index,'Price class': svc_target_predictions})
output.to_csv('CA4_submission_svc',index=False)

### Using Tomek links undersampling 

In [92]:
Training_data = pd.read_csv('train.csv', index_col=0)
df = Training_data.copy()
df = df.drop(['YearBuilt', 'Bedrooms'], axis =1)
df = df.dropna()
df = df[df.Landsize != 0.0]
df = pd.get_dummies(df,columns=['Type','Method','Regionname'])
df.shape

(9761, 26)

In [93]:
tl = TomekLinks()

undersample_trainX, undersample_trainY = tl.fit_resample(df.drop('Price class', axis=1), df['Price class'])
undersample_train = pd.concat([pd.DataFrame(undersample_trainY), pd.DataFrame(undersample_trainX)], axis=1)
df = undersample_train

In [None]:
## mean encoding
mean_encoding_weather = df.groupby(['weather_situation'])['temperature'].mean().to_dict()
mean_encoding_season = df.groupby(['season'])['temperature'].mean().to_dict()

df['mean_weather'] = df['weather_situation'].map(mean_encoding_weather)
df['mean_season'] = df['season'].map(mean_encoding_season)

## freq encoding
freq_encoding_hour = df['hour'].value_counts()
freq_encoding_month = df['month'].value_counts()
freq_encoding_weekday = df['weekday'].value_counts()

df['freq_hour'] = df['hour'].map(freq_encoding_hour)
df['freq_month'] = df['month'].map(freq_encoding_month)
df['freq_weekday'] = df['weekday'].map(freq_encoding_weekday)

#scaling 
min_max_scaler = MinMaxScaler()
df[['freq_hour','freq_month','freq_weekday']] = min_max_scaler.fit_transform(df[['freq_hour','freq_month','freq_weekday']])

# Dropping the rest columns
df = df.drop(['hour','month','weekday','weather_situation','season'], axis= 1)

In [None]:
mean_encoding_hour = dft.groupby(['hour'])['rental_bikes_count'].mean().to_dict()
mean_encoding_month = dft.groupby(['month'])['rental_bikes_count'].mean().to_dict()
mean_encoding_weekday = dft.groupby(['weekday'])['rental_bikes_count'].mean().to_dict()
mean_encoding_weather = dft.groupby(['weather_situation'])['rental_bikes_count'].mean().to_dict()
mean_encoding_season = dft.groupby(['season'])['rental_bikes_count'].mean().to_dict()

dft['mean_hour'] = dft['hour'].map(mean_encoding_hour)
dft['mean_month'] = dft['month'].map(mean_encoding_month)
dft['mean_weekday'] = dft['weekday'].map(mean_encoding_weekday)
dft['mean_weather'] = dft['weather_situation'].map(mean_encoding_weather)
dft['mean_season'] = dft['season'].map(mean_encoding_season)

#scaling 
min_max_scaler = MinMaxScaler()
dft[['mean_hour','mean_month','mean_weekday','mean_weather','mean_season']] = min_max_scaler.fit_transform(dft[['mean_hour','mean_month','mean_weekday','mean_weather','mean_season']])

# Dropping the rest columns
dft = dft.drop(['hour','month','weekday','weather_situation','season'], axis= 1)

In [95]:
X,y = df.drop('Price class', axis = 1).copy(), df['Price class'].copy()
#X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=100, stratify= y)

y

0       1
1       0
2       0
3       0
4       1
       ..
8481    1
8482    2
8483    0
8484    0
8485    0
Name: Price class, Length: 8486, dtype: int64

In [97]:
#----------------------------------------------------

forest = RandomForestClassifier(criterion='gini',
                                        n_estimators=500, 
                                        random_state= 100,
                                        n_jobs=-1)
forest.fit(X_train, y_train)
FTrain = forest.score(X_train, y_train)
FTest  = forest.score(X_test, y_test)

#------------------------------------------------------
sc = StandardScaler()
sc.fit(X_train)

X_train_sc = sc.transform(X_train)
X_test_sc = sc.transform(X_test)

svm = SVC(kernel='rbf', C=20, random_state=100)
svm.fit(X_train_sc, y_train)

print('Train accuracy forest: {0:.5f}'.format(svm.score(X_train_sc, y_train)))
print('Test accuracy forest: {0:.5f}'.format(svm.score(X_test_sc, y_test)))

print('Train accuracy SVM : {0:.5f}'.format(forest.score(X_train, y_train)))
print('Test accuracy SVM : {0:.5f}'.format(forest.score(X_test, y_test)))


Train accuracy forest: 0.92063
Test accuracy forest: 0.87038
Train accuracy SVM : 0.99988
Test accuracy SVM : 0.90833
