In [1]:
#importing modules
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler, scale, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor

In [2]:
#downloading and reading the datasets
train_dataset = pd.read_csv('train_dataset.csv')
test_dataset = pd.read_csv('test_dataset.csv')
train_dataset.head()

Unnamed: 0,ID,Tag,Reputation,Answers,Username,Views,Upvotes
0,52664,a,3942.0,2.0,155623,7855.0,42.0
1,327662,a,26046.0,12.0,21781,55801.0,1175.0
2,468453,c,1358.0,4.0,56177,8067.0,60.0
3,96996,a,264.0,3.0,168793,27064.0,9.0
4,131465,c,4271.0,4.0,112223,13986.0,83.0


In [3]:
#Need to drop the columns which are not required like 'ID' & 'Username'
dataset_after_feature_removal = train_dataset.loc[:,['Tag','Reputation', 'Answers', 'Views']]
x_train, x_val, y_train, y_val = train_test_split(dataset_after_feature_removal, train_dataset['Upvotes'], test_size = 0.3)

#Will create pre-processing pipeline to handle categorical and Numerical features.
categorical_features = ['Tag']
categorical_transformer =  Pipeline(steps=[
    ('labelencoder', OneHotEncoder())
])


numerical_features = ['Reputation', 'Answers', 'Views']
numerical_transform = Pipeline(steps=[
    ('standardscalar', MinMaxScaler())
])

preprocess = ColumnTransformer(
    remainder = 'passthrough',
    transformers= [
        ('categorical', categorical_transformer, categorical_features),
        ('numerical', numerical_transform, numerical_features)
    ]
)

preprocess

ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('categorical', Pipeline(memory=None,
     steps=[('labelencoder', OneHotEncoder(categorical_features=None, categories=None,
       dtype=<class 'numpy.float64'>, handle_unknown='error',
       n_values=None, sparse=True))]), ['Tag']), ('numerical', Pipeline(memory=None,
     steps=[('standardscalar', MinMaxScaler(copy=True, feature_range=(0, 1)))]), ['Reputation', 'Answers', 'Views'])])

In [4]:
#Now performing preprocessing on the x_train and x_val dataset.
x_train_transform = preprocess.fit_transform(x_train)
x_val_transform = preprocess.fit_transform(x_val)

# Also need to convert y_train and y_val dataset into numpy array
y_train_transform = np.array(y_train)
y_val_transform = np.array(y_val)

In [5]:
#creating 2 model, one will use Gradient Boosting method and second one will use Adaboost Regressor
model_gradient_boosting = GradientBoostingRegressor(learning_rate = 0.1, n_estimators = 500, max_depth = 9)
model_gradient_boosting.fit(x_train_transform, y_train_transform)

#Will perform comparision in these 2 models
adaboost = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(), n_estimators=500,learning_rate=0.1, random_state=5)
adaboost.fit(x_train_transform, y_train_transform)

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
         learning_rate=0.1, loss='linear', n_estimators=500,
         random_state=5)

In [9]:
#Now will perform the prediction using the models created 
pred_gradient_boosting = model_gradient_boosting.predict(x_val_transform)
gradient_boosting_mean_sqr = mean_squared_error(y_val_transform, pred_gradient_boosting)

pred_adaboosting = adaboost.predict(x_val_transform)
ada_boosting_mean_sqr = mean_squared_error(y_val_transform, pred_adaboosting)

print(gradient_boosting_mean_sqr)
print(ada_boosting_mean_sqr)

print(pred_adaboosting[1:50])
print(y_val_transform[1:50])

8920544.118900165
5970950.582344994
[6.7000e+01 1.0500e+02 2.7000e+01 2.5000e+01 7.0000e+00 5.6000e+01
 5.5000e+01 5.6000e+01 1.6200e+02 6.0000e+00 2.4000e+01 3.3000e+01
 1.0200e+02 1.0000e+00 1.4000e+01 9.0000e+00 1.5000e+01 5.0000e+00
 1.6000e+01 3.8800e+02 1.1636e+04 5.7100e+02 1.2000e+01 9.6400e+02
 6.0000e+00 4.8000e+01 5.4300e+02 2.0000e+00 3.5000e+01 2.1000e+01
 1.0000e+00 4.0000e+00 1.5300e+02 1.6000e+01 1.2000e+01 1.6600e+02
 1.6000e+01 4.4000e+01 8.0000e+00 3.4391e+04 1.2000e+01 7.1000e+01
 5.0000e+01 5.2000e+01 2.0000e+01 5.0000e+00 2.8900e+02 4.6800e+02
 5.0500e+02]
[3.8000e+01 5.4000e+01 4.6000e+01 2.1000e+01 5.0000e+00 7.2000e+01
 2.3000e+01 1.8000e+01 6.5000e+01 6.0000e+00 2.9000e+01 1.8000e+01
 1.5000e+02 9.0000e+00 6.0000e+00 3.0000e+00 2.7000e+01 0.0000e+00
 1.5000e+01 1.7800e+02 1.1130e+04 4.1900e+02 5.0000e+00 5.7700e+02
 2.3300e+02 3.2000e+01 2.5200e+02 3.0000e+00 1.6000e+01 2.5000e+01
 0.0000e+00 0.0000e+00 1.5800e+02 6.0000e+01 7.0000e+00 2.1000e+02
 1.1000e+01 6