In [2]:
import numpy as np 
import pandas as pd 

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import LogisticRegression,RidgeCV,LassoCV,ElasticNetCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.metrics import r2_score
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

In [3]:
train=pd.read_csv("/Users/gardasnagarjun/Documents/5001-Ind/train.csv")
labels=train["playtime_forever"]
test=pd.read_csv("/Users/gardasnagarjun/Documents/5001-Ind/test.csv")
data=pd.concat([train,test],ignore_index=True)
data=data.drop("playtime_forever",1)

In [4]:
data.head()

Unnamed: 0,categories,genres,id,is_free,price,purchase_date,release_date,tags,total_negative_reviews,total_positive_reviews
0,"Single-player,Steam Trading Cards,Steam Cloud","Adventure,Casual,Indie",0,False,3700.0,"Jul 2, 2018","10 Dec, 2013","Indie,Adventure,Story Rich,Casual,Atmospheric,...",96.0,372.0
1,"Single-player,Partial Controller Support",RPG,1,True,0.0,"Nov 26, 2016","12 Aug, 2015","Mod,Utilities,RPG,Game Development,Singleplaye...",0.0,23.0
2,"Single-player,Full controller support,Steam Tr...","Adventure,Casual,Indie",2,False,5000.0,"Jul 2, 2018","28 Jan, 2014","Point & Click,Adventure,Story Rich,Comedy,Indi...",663.0,3018.0
3,"Single-player,Multi-player,Steam Achievements,...","Action,RPG",3,False,9900.0,"Nov 28, 2016","31 Mar, 2010","Medieval,RPG,Open World,Strategy,Sandbox,Actio...",1746.0,63078.0
4,"Single-player,Co-op,Steam Achievements,Full co...","Action,Indie,Strategy",4,False,4800.0,"Mar 4, 2018","30 Jul, 2012","Tower Defense,Co-op,Action,Strategy,Online Co-...",523.0,8841.0


In [5]:
#Null Values
data.isnull().sum(axis=0)

categories                0
genres                    0
id                        0
is_free                   0
price                     0
purchase_date             4
release_date              0
tags                      0
total_negative_reviews    4
total_positive_reviews    4
dtype: int64

In [6]:
data=data.fillna(value={"total_negative_reviews":data['total_negative_reviews'].mean(),
                                "total_positive_reviews":data['total_positive_reviews'].mean(),
                                "purchase_date":data['purchase_date'].mode()[0]})
data.isnull().sum(axis=0)

categories                0
genres                    0
id                        0
is_free                   0
price                     0
purchase_date             0
release_date              0
tags                      0
total_negative_reviews    0
total_positive_reviews    0
dtype: int64

In [7]:
#Adding 1 to change certain values of reviews from 0 to 1
data["total_negative_reviews"]=data["total_negative_reviews"]+1
data["total_positive_reviews"]=data["total_positive_reviews"]+1

In [8]:
#Taking tags as features (for now):
dataset=data.copy()
#OE_genres=dataset['genres'].str.get_dummies(sep=',')
#OE_categors=dataset['categories'].str.get_dummies(sep=',')
OE_tags=dataset['tags'].str.get_dummies(sep=',')
#Concating them to the dataset:
dataset=pd.concat([dataset,OE_tags],axis=1)
dataset.drop(['genres','categories','tags'],axis=1,inplace=True)
dataset.head()

Unnamed: 0,id,is_free,price,purchase_date,release_date,total_negative_reviews,total_positive_reviews,1980s,1990's,2.5D,...,Voxel,Walking Simulator,War,Wargame,Warhammer 40K,Western,World War I,World War II,Zombies,eSports
0,0,False,3700.0,"Jul 2, 2018","10 Dec, 2013",97.0,373.0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,1,True,0.0,"Nov 26, 2016","12 Aug, 2015",1.0,24.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,False,5000.0,"Jul 2, 2018","28 Jan, 2014",664.0,3019.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,False,9900.0,"Nov 28, 2016","31 Mar, 2010",1747.0,63079.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,False,4800.0,"Mar 4, 2018","30 Jul, 2012",524.0,8842.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
#Review ratio (feature)
dataset['Review_ratio']=dataset["total_positive_reviews"]/dataset["total_negative_reviews"]
dataset.head()

Unnamed: 0,id,is_free,price,purchase_date,release_date,total_negative_reviews,total_positive_reviews,1980s,1990's,2.5D,...,Walking Simulator,War,Wargame,Warhammer 40K,Western,World War I,World War II,Zombies,eSports,Review_ratio
0,0,False,3700.0,"Jul 2, 2018","10 Dec, 2013",97.0,373.0,0,0,0,...,1,0,0,0,0,0,0,0,0,3.845361
1,1,True,0.0,"Nov 26, 2016","12 Aug, 2015",1.0,24.0,0,0,0,...,0,0,0,0,0,0,0,0,0,24.0
2,2,False,5000.0,"Jul 2, 2018","28 Jan, 2014",664.0,3019.0,0,0,0,...,0,0,0,0,0,0,0,0,0,4.546687
3,3,False,9900.0,"Nov 28, 2016","31 Mar, 2010",1747.0,63079.0,0,0,0,...,0,0,0,0,0,0,0,0,0,36.107041
4,4,False,4800.0,"Mar 4, 2018","30 Jul, 2012",524.0,8842.0,0,0,0,...,0,0,0,0,0,0,0,0,0,16.874046


In [10]:
#Converting purchase date to date format, We'll be using purchase date as a feature (for now)
purchase_date=pd.to_datetime(dataset["purchase_date"])
release_date=pd.to_datetime(data["release_date"])
dataset.drop(["purchase_date","release_date"],axis=1,inplace=True)
dataset=pd.concat([dataset,purchase_date,release_date],axis=1)
#dataset["purchase_date"]=dataset["purchase_date"].astype(np.int64)
dataset.head()

Unnamed: 0,id,is_free,price,total_negative_reviews,total_positive_reviews,1980s,1990's,2.5D,2D,3D,...,Wargame,Warhammer 40K,Western,World War I,World War II,Zombies,eSports,Review_ratio,purchase_date,release_date
0,0,False,3700.0,97.0,373.0,0,0,0,0,0,...,0,0,0,0,0,0,0,3.845361,2018-07-02,2013-12-10
1,1,True,0.0,1.0,24.0,0,0,0,0,0,...,0,0,0,0,0,0,0,24.0,2016-11-26,2015-08-12
2,2,False,5000.0,664.0,3019.0,0,0,0,1,0,...,0,0,0,0,0,0,0,4.546687,2018-07-02,2014-01-28
3,3,False,9900.0,1747.0,63079.0,0,0,0,0,0,...,0,0,0,0,0,0,0,36.107041,2016-11-28,2010-03-31
4,4,False,4800.0,524.0,8842.0,0,0,0,0,0,...,0,0,0,0,0,0,0,16.874046,2018-03-04,2012-07-30


In [11]:
#interval between the purchase date and release date:
dataset["P-R_interval"]=(dataset['purchase_date']-dataset['release_date']).dt.days
dataset.tail()

Unnamed: 0,id,is_free,price,total_negative_reviews,total_positive_reviews,1980s,1990's,2.5D,2D,3D,...,Warhammer 40K,Western,World War I,World War II,Zombies,eSports,Review_ratio,purchase_date,release_date,P-R_interval
442,85,False,5000.0,517.0,9335.0,0,0,0,0,0,...,0,1,0,0,0,0,18.056093,2018-03-23,2013-05-22,1766
443,86,False,3600.0,177.0,1246.0,0,0,0,0,0,...,0,0,0,0,0,0,7.039548,2018-03-03,2009-09-14,3092
444,87,False,11200.0,6351.0,33129.0,0,0,0,0,0,...,0,0,0,0,0,0,5.216344,2018-02-18,2013-08-13,1650
445,88,True,0.0,1.0,6.0,0,0,0,0,0,...,0,0,0,0,0,0,6.0,2017-11-19,2017-06-19,153
446,89,False,26800.0,2595.0,17778.0,0,0,0,0,0,...,1,0,0,0,0,0,6.850867,2018-11-25,2017-09-28,423


In [12]:
#Removing unwanted columns:
dataset.drop(["total_negative_reviews","is_free","price","id","purchase_date","release_date"],axis=1,inplace=True)
#dataset.head()
data.isnull().sum(axis=0)


categories                0
genres                    0
id                        0
is_free                   0
price                     0
purchase_date             0
release_date              0
tags                      0
total_negative_reviews    0
total_positive_reviews    0
dtype: int64

In [13]:
#Splitting the Data into its original training and test data
train_data = dataset.loc[:356, :]
predict_data = dataset.loc[357:]
predict_data.reset_index(drop=True, inplace=True)

In [14]:
train_data

Unnamed: 0,total_positive_reviews,1980s,1990's,2.5D,2D,3D,3D Platformer,3D Vision,4 Player Local,4X,...,War,Wargame,Warhammer 40K,Western,World War I,World War II,Zombies,eSports,Review_ratio,P-R_interval
0,373.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3.845361,1665
1,24.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,24.000000,472
2,3019.0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4.546687,1616
3,63079.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,36.107041,2434
4,8842.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,16.874046,2043
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352,151.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1.641304,339
353,19009.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3.919381,1149
354,5100.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2.965116,817
355,719.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4.493750,477


In [15]:
predict_data

Unnamed: 0,total_positive_reviews,1980s,1990's,2.5D,2D,3D,3D Platformer,3D Vision,4 Player Local,4X,...,War,Wargame,Warhammer 40K,Western,World War I,World War II,Zombies,eSports,Review_ratio,P-R_interval
0,2608.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2.322351,2350
1,5763.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2.577370,452
2,688.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,5.134328,848
3,68.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1.700000,3850
4,40345.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,10.877595,1282
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,9335.0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,18.056093,1766
86,1246.0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,7.039548,3092
87,33129.0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,5.216344,1650
88,6.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,6.000000,153


In [16]:
features=train_data
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(features,labels,test_size=0.3,random_state=0)
print (x_train.shape, y_train.shape)
print (x_test.shape, y_test.shape)

(249, 320) (249,)
(108, 320) (108,)


In [26]:
rf=RandomForestRegressor(n_estimators=1000,max_depth=30,random_state=46)
rf.fit(x_train,y_train)
predict_y = rf.predict(x_train).clip(min=0)
print("training error:",np.sqrt(mean_squared_error(y_train, predict_y)))

predict_y = rf.predict(x_test).clip(min=0)
print("testing error:", np.sqrt(mean_squared_error(y_test, predict_y)))

training error: 3.50369528788461
testing error: 14.188504835672601


In [27]:
predict_submission=rf.predict(predict_data).clip(min=0)
submission = pd.read_csv("/Users/gardasnagarjun/Documents/5001-Ind/Submission/samplesubmission.csv")
submission["playtime_forever"] = predict_submission
output_file="/Users/gardasnagarjun/Documents/5001-Ind/Submission/submission_RFR(Refined-Tags).csv"
submission.to_csv(output_file, index=False)

In [28]:
#Using GridSearchCV:
param_grid = [{'max_depth':[10, 20, 30, 40, 50, 60, 70, 80, 100], 
                     'max_leaf_nodes':[10, 50, 100, 200],
                     'n_estimators': [10,20, 30, 40, 50,100,200,300,400,500]}]
tuned_model = GridSearchCV(RandomForestRegressor(), param_grid, cv=5,n_jobs=-1,
                   scoring='neg_mean_squared_error')

tuned_model.fit(x_train, y_train)
predict_y = tuned_model.predict(x_train).clip(min=0)
print("training error:",np.sqrt(mean_squared_error(y_train, predict_y)))

predict_y = tuned_model.predict(x_test).clip(min=0)
print("testing error:", np.sqrt(mean_squared_error(y_test, predict_y)))

training error: 4.071257448937463
testing error: 14.050924616816095




In [29]:
predict_submission=tuned_model.predict(predict_data).clip(min=0)
submission = pd.read_csv("/Users/gardasnagarjun/Documents/5001-Ind/Submission/samplesubmission.csv")
submission["playtime_forever"] = predict_submission
output_file="/Users/gardasnagarjun/Documents/5001-Ind/Submission/submission_RFR(Refined-Tags_GridCV).csv"
submission.to_csv(output_file, index=False)

In [30]:
print(tuned_model.best_params_,"\n")

{'max_depth': 80, 'max_leaf_nodes': 200, 'n_estimators': 10} 



In [31]:
f=RandomForestRegressor()
f.fit(x_train,y_train)
predict_y = f.predict(x_train).clip(min=0)
print("training error:",np.sqrt(mean_squared_error(y_train, predict_y)))

predict_y = f.predict(x_test).clip(min=0)
print("testing error:", np.sqrt(mean_squared_error(y_test, predict_y)))

training error: 2.9809033502578983
testing error: 14.643805576746342


In [32]:
predict_submission=f.predict(predict_data).clip(min=0)
submission = pd.read_csv("/Users/gardasnagarjun/Documents/5001-Ind/Submission/samplesubmission.csv")
submission["playtime_forever"] = predict_submission
output_file="/Users/gardasnagarjun/Documents/5001-Ind/Submission/submission_RFR(Refined-No_param).csv"
submission.to_csv(output_file, index=False)