In [1]:
import numpy as np 
import pandas as pd 

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import StandardScaler,MinMaxScaler,RobustScaler
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression,RidgeCV,LassoCV,ElasticNetCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.decomposition import PCA
from sklearn.preprocessing import Imputer
from sklearn.model_selection import KFold
from sklearn import linear_model
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import svm
from sklearn.metrics import r2_score
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

In [2]:
train=pd.read_csv("/Users/gardasnagarjun/Documents/5001-Ind/train.csv")
labels=train["playtime_forever"]
test=pd.read_csv("/Users/gardasnagarjun/Documents/5001-Ind/test.csv")
data=pd.concat([train,test],ignore_index=True)
data=data.drop("playtime_forever",1)

In [3]:
train.head()

Unnamed: 0,id,playtime_forever,is_free,price,genres,categories,tags,purchase_date,release_date,total_positive_reviews,total_negative_reviews
0,0,0.0,False,3700.0,"Adventure,Casual,Indie","Single-player,Steam Trading Cards,Steam Cloud","Indie,Adventure,Story Rich,Casual,Atmospheric,...","Jul 2, 2018","10 Dec, 2013",372.0,96.0
1,1,0.016667,True,0.0,RPG,"Single-player,Partial Controller Support","Mod,Utilities,RPG,Game Development,Singleplaye...","Nov 26, 2016","12 Aug, 2015",23.0,0.0
2,2,0.0,False,5000.0,"Adventure,Casual,Indie","Single-player,Full controller support,Steam Tr...","Point & Click,Adventure,Story Rich,Comedy,Indi...","Jul 2, 2018","28 Jan, 2014",3018.0,663.0
3,3,1.533333,False,9900.0,"Action,RPG","Single-player,Multi-player,Steam Achievements,...","Medieval,RPG,Open World,Strategy,Sandbox,Actio...","Nov 28, 2016","31 Mar, 2010",63078.0,1746.0
4,4,22.333333,False,4800.0,"Action,Indie,Strategy","Single-player,Co-op,Steam Achievements,Full co...","Tower Defense,Co-op,Action,Strategy,Online Co-...","Mar 4, 2018","30 Jul, 2012",8841.0,523.0


In [4]:
#Null Values
data.isnull().sum(axis=0)

categories                0
genres                    0
id                        0
is_free                   0
price                     0
purchase_date             4
release_date              0
tags                      0
total_negative_reviews    4
total_positive_reviews    4
dtype: int64

In [5]:
data=data.fillna(value={"total_negative_reviews":data['total_negative_reviews'].mean(),
                                "total_positive_reviews":data['total_positive_reviews'].mean(),
                                "purchase_date":data['purchase_date'].mode()[0]})
data.isnull().sum(axis=0)

categories                0
genres                    0
id                        0
is_free                   0
price                     0
purchase_date             0
release_date              0
tags                      0
total_negative_reviews    0
total_positive_reviews    0
dtype: int64

In [6]:
#Adding 1 to change certain values of reviews from 0 to 1
data["total_negative_reviews"]=data["total_negative_reviews"]+1
data["total_positive_reviews"]=data["total_positive_reviews"]+1

In [7]:
#Taking tags as features (for now):
dataset=data.copy()
#OE_genres=dataset['genres'].str.get_dummies(sep=',')
OE_categors=dataset['categories'].str.get_dummies(sep=',')
#OE_tags=dataset['tags'].str.get_dummies(sep=',')
#Concating them to the dataset:
dataset=pd.concat([dataset,OE_categors],axis=1)
dataset.drop(['genres','categories','tags'],axis=1,inplace=True)
dataset.head()

Unnamed: 0,id,is_free,price,purchase_date,release_date,total_negative_reviews,total_positive_reviews,Captions available,Co-op,Commentary available,...,Single-player,Stats,Steam Achievements,Steam Cloud,Steam Leaderboards,Steam Trading Cards,Steam Workshop,SteamVR Collectibles,VR Support,Valve Anti-Cheat enabled
0,0,False,3700.0,"Jul 2, 2018","10 Dec, 2013",97.0,373.0,0,0,0,...,1,0,0,1,0,1,0,0,0,0
1,1,True,0.0,"Nov 26, 2016","12 Aug, 2015",1.0,24.0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,2,False,5000.0,"Jul 2, 2018","28 Jan, 2014",664.0,3019.0,0,0,0,...,1,0,0,1,0,1,0,0,0,0
3,3,False,9900.0,"Nov 28, 2016","31 Mar, 2010",1747.0,63079.0,0,0,0,...,1,0,1,0,0,1,1,0,0,0
4,4,False,4800.0,"Mar 4, 2018","30 Jul, 2012",524.0,8842.0,0,1,0,...,1,0,1,1,1,1,0,0,0,0


In [8]:
#Review ratio (feature)
dataset['Review_ratio']=dataset["total_positive_reviews"]/dataset["total_negative_reviews"]
dataset.head()

Unnamed: 0,id,is_free,price,purchase_date,release_date,total_negative_reviews,total_positive_reviews,Captions available,Co-op,Commentary available,...,Stats,Steam Achievements,Steam Cloud,Steam Leaderboards,Steam Trading Cards,Steam Workshop,SteamVR Collectibles,VR Support,Valve Anti-Cheat enabled,Review_ratio
0,0,False,3700.0,"Jul 2, 2018","10 Dec, 2013",97.0,373.0,0,0,0,...,0,0,1,0,1,0,0,0,0,3.845361
1,1,True,0.0,"Nov 26, 2016","12 Aug, 2015",1.0,24.0,0,0,0,...,0,0,0,0,0,0,0,0,0,24.0
2,2,False,5000.0,"Jul 2, 2018","28 Jan, 2014",664.0,3019.0,0,0,0,...,0,0,1,0,1,0,0,0,0,4.546687
3,3,False,9900.0,"Nov 28, 2016","31 Mar, 2010",1747.0,63079.0,0,0,0,...,0,1,0,0,1,1,0,0,0,36.107041
4,4,False,4800.0,"Mar 4, 2018","30 Jul, 2012",524.0,8842.0,0,1,0,...,0,1,1,1,1,0,0,0,0,16.874046


In [9]:
#Converting purchase date to date format, We'll be using purchase date as a feature (for now)
purchase_date=pd.to_datetime(dataset["purchase_date"])
#release_date=pd.to_datetime(data["release_date"])
dataset.drop(["purchase_date","release_date"],axis=1,inplace=True)
dataset=pd.concat([dataset,purchase_date],axis=1)
dataset["purchase_date"]=dataset["purchase_date"].astype(np.int64)
dataset.head()

Unnamed: 0,id,is_free,price,total_negative_reviews,total_positive_reviews,Captions available,Co-op,Commentary available,Cross-Platform Multiplayer,Full controller support,...,Steam Achievements,Steam Cloud,Steam Leaderboards,Steam Trading Cards,Steam Workshop,SteamVR Collectibles,VR Support,Valve Anti-Cheat enabled,Review_ratio,purchase_date
0,0,False,3700.0,97.0,373.0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,3.845361,1530489600000000000
1,1,True,0.0,1.0,24.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,24.0,1480118400000000000
2,2,False,5000.0,664.0,3019.0,0,0,0,0,1,...,0,1,0,1,0,0,0,0,4.546687,1530489600000000000
3,3,False,9900.0,1747.0,63079.0,0,0,0,0,0,...,1,0,0,1,1,0,0,0,36.107041,1480291200000000000
4,4,False,4800.0,524.0,8842.0,0,1,0,0,1,...,1,1,1,1,0,0,0,0,16.874046,1520121600000000000


In [10]:
#Removing unwanted columns:
dataset.drop(["total_negative_reviews","is_free","price","id"],axis=1,inplace=True)
dataset.head()

Unnamed: 0,total_positive_reviews,Captions available,Co-op,Commentary available,Cross-Platform Multiplayer,Full controller support,In-App Purchases,Includes Source SDK,Includes level editor,Local Co-op,...,Steam Achievements,Steam Cloud,Steam Leaderboards,Steam Trading Cards,Steam Workshop,SteamVR Collectibles,VR Support,Valve Anti-Cheat enabled,Review_ratio,purchase_date
0,373.0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,3.845361,1530489600000000000
1,24.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,24.0,1480118400000000000
2,3019.0,0,0,0,0,1,0,0,0,0,...,0,1,0,1,0,0,0,0,4.546687,1530489600000000000
3,63079.0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,1,0,0,0,36.107041,1480291200000000000
4,8842.0,0,1,0,0,1,0,0,0,0,...,1,1,1,1,0,0,0,0,16.874046,1520121600000000000


In [11]:
#Splitting the Data into its original training and test data
train_data = dataset.loc[:356, :]
predict_data = dataset.loc[357:]
predict_data.reset_index(drop=True, inplace=True)

In [12]:
train_data

Unnamed: 0,total_positive_reviews,Captions available,Co-op,Commentary available,Cross-Platform Multiplayer,Full controller support,In-App Purchases,Includes Source SDK,Includes level editor,Local Co-op,...,Steam Achievements,Steam Cloud,Steam Leaderboards,Steam Trading Cards,Steam Workshop,SteamVR Collectibles,VR Support,Valve Anti-Cheat enabled,Review_ratio,purchase_date
0,373.0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,3.845361,1530489600000000000
1,24.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,24.000000,1480118400000000000
2,3019.0,0,0,0,0,1,0,0,0,0,...,0,1,0,1,0,0,0,0,4.546687,1530489600000000000
3,63079.0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,1,0,0,0,36.107041,1480291200000000000
4,8842.0,0,1,0,0,1,0,0,0,0,...,1,1,1,1,0,0,0,0,16.874046,1520121600000000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352,151.0,0,0,0,0,0,0,0,0,0,...,1,1,0,1,0,0,0,0,1.641304,1511481600000000000
353,19009.0,0,0,0,0,1,0,0,0,0,...,1,1,1,1,0,0,0,0,3.919381,1534291200000000000
354,5100.0,0,0,0,0,1,0,0,0,0,...,1,1,1,1,0,0,0,0,2.965116,1517270400000000000
355,719.0,0,0,0,0,1,0,0,0,0,...,1,0,0,1,0,0,0,0,4.493750,1506124800000000000


In [13]:
predict_data

Unnamed: 0,total_positive_reviews,Captions available,Co-op,Commentary available,Cross-Platform Multiplayer,Full controller support,In-App Purchases,Includes Source SDK,Includes level editor,Local Co-op,...,Steam Achievements,Steam Cloud,Steam Leaderboards,Steam Trading Cards,Steam Workshop,SteamVR Collectibles,VR Support,Valve Anti-Cheat enabled,Review_ratio,purchase_date
0,2608.0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,2.322351,1540684800000000000
1,5763.0,0,0,0,1,0,0,0,0,0,...,1,1,0,1,0,0,0,0,2.577370,1563580800000000000
2,688.0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,5.134328,1563235200000000000
3,68.0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1.700000,1517270400000000000
4,40345.0,0,1,0,0,1,0,0,0,0,...,1,1,0,1,1,0,0,0,10.877595,1487894400000000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,9335.0,0,0,0,0,1,0,0,0,0,...,1,1,1,1,0,0,0,0,18.056093,1521763200000000000
86,1246.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,7.039548,1520035200000000000
87,33129.0,0,0,0,1,0,0,0,0,0,...,1,1,0,1,1,0,0,0,5.216344,1518912000000000000
88,6.0,0,1,0,1,0,0,0,0,0,...,1,1,1,0,0,0,0,0,6.000000,1511049600000000000


In [14]:
features=train_data
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(features,labels,test_size=0.3,random_state=0)
print (x_train.shape, y_train.shape)
print (x_test.shape, y_test.shape)

(249, 32) (249,)
(108, 32) (108,)


In [15]:
rf=RandomForestRegressor(n_estimators=1000,max_depth=20,random_state=42)
rf.fit(x_train,y_train)
predict_y = rf.predict(x_train).clip(min=0)
print("training error:",np.sqrt(mean_squared_error(y_train, predict_y)))

predict_y = rf.predict(x_test).clip(min=0)
print("testing error:", np.sqrt(mean_squared_error(y_test, predict_y)))

training error: 3.338339320270664
testing error: 14.532231359663674


In [16]:
predict_submission=rf.predict(predict_data).clip(min=0)
submission = pd.read_csv("/Users/gardasnagarjun/Documents/5001-Ind/Submission/samplesubmission.csv")
submission["playtime_forever"] = predict_submission
output_file="/Users/gardasnagarjun/Documents/5001-Ind/Submission/submission_RFR(Refined_Categories).csv"
submission.to_csv(output_file, index=False)

In [18]:
from sklearn.decomposition import PCA
comp=train_data.shape[0]
pca=PCA(n_components=32)
X_scaled_pca=pca.fit_transform(train_data)
test_X_scaled_pca = pca.transform(predict_data)

In [19]:
X_scaled_pca.shape,test_X_scaled_pca

((357, 32), array([[-1.90614857e+16, -9.60031141e+03, -6.66975341e+00, ...,
         -1.77482094e-03, -4.22115503e-02,  5.61880665e-03],
        [-4.19574857e+16, -2.66870281e+03, -5.63183307e+00, ...,
          1.07839024e-01, -6.03061217e-04,  6.63626695e-02],
        [-4.16118857e+16, -7.80070811e+03, -2.93236589e+00, ...,
          2.69700558e-02, -2.26656594e-02,  3.40848649e-02],
        ...,
        [ 2.71131429e+15,  1.73293477e+04, -5.94607359e+00, ...,
          3.13514357e-02,  2.43188794e-01, -3.92229917e-02],
        [ 1.05737143e+16, -1.70905254e+04, -3.95758479e+00, ...,
         -5.65192375e-02, -1.22959302e-02,  1.16996059e-02],
        [-2.14806857e+16,  5.96872667e+03, -2.71644436e+00, ...,
         -5.22340176e-02,  1.31124606e-01, -2.91596665e-03]]))

In [21]:
from sklearn.model_selection import train_test_split
x_train_pca,x_test_pca,y_train_pca,y_test_pca=train_test_split(X_scaled_pca,labels,test_size=0.2,random_state=0)
print (x_train.shape, y_train.shape)
print (x_test.shape, y_test.shape)

(249, 32) (249,)
(108, 32) (108,)


In [23]:
rf=RandomForestRegressor(n_estimators=15,max_depth=30,random_state=10)
rf.fit(X_scaled_pca,labels)
predict_y = rf.predict(x_train_pca).clip(min=0)
print("training error:",np.sqrt(mean_squared_error(y_train_pca, predict_y)))

predict_y = rf.predict(x_test_pca).clip(min=0)
print("testing error:", np.sqrt(mean_squared_error(y_test_pca, predict_y)))

training error: 4.865157949715184
testing error: 4.7183861216090195


In [287]:
predict_submission=rf.predict(test_X_scaled_pca).clip(min=0)
submission = pd.read_csv("/Users/gardasnagarjun/Documents/5001/Submission/samplesubmission.csv")
submission["playtime_forever"] = predict_submission
submission.to_csv("submission_RFR(PCA).csv", index=False)