In [1]:
import pandas as pd
import numpy as np
import re
import subprocess
from pathlib import Path 
import io
import dvc.api, dvc.repo

In [2]:
url = "https://github.com/mostafa-fallaha/data-drift-simulation"
data = dvc.api.read("new_data/Google-Playstore.csv", encoding='utf-8', repo=url)
df = pd.read_csv(io.StringIO(data))
df.head()

Unnamed: 0,Category,Rating,Rating Count,Installs,Free,Price,Size_M,Content Rating,Ad Supported,In App Purchases,Editors Choice,Released_Year,Released_Month,Days_Between,Daily_Avg_Installs
0,Tools,3.9,494.0,99306,True,0.0,0.1,Everyone,False,False,False,2010,12,0,99306.0
1,Finance,4.0,1269.0,317027,True,0.0,0.0,Everyone,False,False,False,2010,3,1,317027.0
2,Books & Reference,0.0,0.0,125,False,1.99,1.3,Everyone,False,False,False,2010,8,0,125.0
3,Productivity,3.2,17.0,8587,True,0.0,0.0,Everyone,True,False,False,2010,12,1,8587.0
4,Tools,3.8,30.0,6454,True,0.0,0.0,Everyone,False,False,False,2010,6,0,6454.0


In [3]:
df.shape

(90000, 15)

In [4]:
df.columns

Index(['Category', 'Rating', 'Rating Count', 'Installs', 'Free', 'Price',
       'Size_M', 'Content Rating', 'Ad Supported', 'In App Purchases',
       'Editors Choice', 'Released_Year', 'Released_Month', 'Days_Between',
       'Daily_Avg_Installs'],
      dtype='object')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90000 entries, 0 to 89999
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Category            90000 non-null  object 
 1   Rating              90000 non-null  float64
 2   Rating Count        90000 non-null  float64
 3   Installs            90000 non-null  int64  
 4   Free                90000 non-null  bool   
 5   Price               90000 non-null  float64
 6   Size_M              90000 non-null  float64
 7   Content Rating      90000 non-null  object 
 8   Ad Supported        90000 non-null  bool   
 9   In App Purchases    90000 non-null  bool   
 10  Editors Choice      90000 non-null  bool   
 11  Released_Year       90000 non-null  int64  
 12  Released_Month      90000 non-null  int64  
 13  Days_Between        90000 non-null  int64  
 14  Daily_Avg_Installs  90000 non-null  float64
dtypes: bool(4), float64(5), int64(4), object(2)
memory us

In [6]:
df

Unnamed: 0,Category,Rating,Rating Count,Installs,Free,Price,Size_M,Content Rating,Ad Supported,In App Purchases,Editors Choice,Released_Year,Released_Month,Days_Between,Daily_Avg_Installs
0,Tools,3.9,494.0,99306,True,0.00,0.1,Everyone,False,False,False,2010,12,0,99306.0
1,Finance,4.0,1269.0,317027,True,0.00,0.0,Everyone,False,False,False,2010,3,1,317027.0
2,Books & Reference,0.0,0.0,125,False,1.99,1.3,Everyone,False,False,False,2010,8,0,125.0
3,Productivity,3.2,17.0,8587,True,0.00,0.0,Everyone,True,False,False,2010,12,1,8587.0
4,Tools,3.8,30.0,6454,True,0.00,0.0,Everyone,False,False,False,2010,6,0,6454.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89995,Art & Design,0.0,0.0,355,True,0.00,11.0,Everyone,True,False,False,2021,3,0,355.0
89996,Finance,0.0,0.0,663732,True,0.00,35.0,Everyone,False,False,False,2021,2,16,41483.2
89997,Social,0.0,0.0,54,True,0.00,38.0,Teen,False,False,False,2021,3,18,3.0
89998,Business,3.2,9.0,1092,True,0.00,1.7,Everyone,False,False,False,2021,2,0,1092.0


In [7]:
df['Released_Year'].value_counts().sort_index()

Released_Year
2010        9
2011       64
2012      124
2013      264
2014      893
2015     2208
2016     3671
2017     7901
2018    11369
2019    20852
2020    28557
2021    14088
Name: count, dtype: int64

In [8]:
df.groupby('Released_Year')['Size_M'].mean()

Released_Year
2010     0.322222
2011     3.770313
2012     2.338710
2013     5.962879
2014     8.144569
2015    11.272011
2016    16.437374
2017    17.514606
2018    16.874800
2019    19.508057
2020    23.558529
2021    27.635349
Name: Size_M, dtype: float64

## Splitting the data into 3

In [32]:
df_A = df.loc[(df['Released_Year'] >= 2010) & (df['Released_Year'] <= 2017)]
df_A

Unnamed: 0,Category,Rating,Rating Count,Installs,Free,Price,Size_M,Content Rating,Ad Supported,In App Purchases,Editors Choice,Released_Year,Released_Month,Days_Between,Daily_Avg_Installs
0,Tools,3.9,494.0,99306,True,0.00,0.1,Everyone,False,False,False,2010,12,0,99306.0
1,Finance,4.0,1269.0,317027,True,0.00,0.0,Everyone,False,False,False,2010,3,1,317027.0
2,Books & Reference,0.0,0.0,125,False,1.99,1.3,Everyone,False,False,False,2010,8,0,125.0
3,Productivity,3.2,17.0,8587,True,0.00,0.0,Everyone,True,False,False,2010,12,1,8587.0
4,Tools,3.8,30.0,6454,True,0.00,0.0,Everyone,False,False,False,2010,6,0,6454.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15129,Entertainment,4.7,43.0,6940,True,0.00,5.3,Everyone,True,False,False,2017,12,0,6940.0
15130,Communication,0.0,0.0,22,True,0.00,6.5,Everyone,False,False,False,2017,8,0,22.0
15131,Music & Audio,0.0,0.0,288,True,0.00,5.1,Everyone,False,False,False,2017,4,2,144.0
15132,Personalization,0.0,0.0,14,True,0.00,6.5,Everyone,True,False,False,2017,6,0,14.0


In [33]:
df_A['Content Rating'].unique()

array(['Everyone', 'Teen', 'Unrated', 'Everyone 10+', 'Mature 17+'],
      dtype=object)

In [34]:
df_B = df.loc[df['Released_Year'].isin([2018, 2019])]
df_B

Unnamed: 0,Category,Rating,Rating Count,Installs,Free,Price,Size_M,Content Rating,Ad Supported,In App Purchases,Editors Choice,Released_Year,Released_Month,Days_Between,Daily_Avg_Installs
15134,Lifestyle,0.0,0.0,29,True,0.0,3.4,Everyone,True,False,False,2018,11,0,29.0
15135,Arcade,0.0,0.0,148,True,0.0,27.0,Everyone,True,True,False,2018,4,4,37.0
15136,Education,0.0,0.0,96,True,0.0,10.0,Everyone,False,False,False,2018,1,0,96.0
15137,Productivity,0.0,0.0,47,True,0.0,3.2,Everyone,False,False,False,2018,6,0,47.0
15138,Health & Fitness,4.5,67.0,7643,True,0.0,5.9,Everyone,False,False,False,2018,9,0,7643.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47350,Productivity,0.0,0.0,621,True,0.0,7.4,Everyone,False,False,False,2019,10,16,38.8
47351,Entertainment,0.0,0.0,130,True,0.0,4.7,Everyone,False,False,False,2019,11,7,18.6
47352,Adventure,0.0,0.0,356,True,0.0,54.0,Everyone,True,True,False,2019,10,1,356.0
47353,Business,0.0,0.0,3,True,0.0,21.0,Everyone,False,False,False,2019,6,0,3.0


In [35]:
df_B['Content Rating'].unique()

array(['Everyone', 'Teen', 'Everyone 10+', 'Mature 17+',
       'Adults only 18+', 'Unrated'], dtype=object)

In [29]:
filepath = Path('df_B.csv')
filepath.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(filepath, index=False)

In [11]:
df_C = df.loc[df['Released_Year'].isin([2020, 2021])]
df_C

Unnamed: 0,Category,Rating,Rating Count,Installs,Free,Price,Size_M,Content Rating,Ad Supported,In App Purchases,Editors Choice,Released_Year,Released_Month,Days_Between,Daily_Avg_Installs
47355,Casual,0.0,0.0,3,True,0.0,2.0,Everyone,False,False,False,2020,1,0,3.0
47356,Productivity,0.0,0.0,347,True,0.0,25.0,Everyone,False,False,False,2020,6,0,347.0
47357,Shopping,5.0,19.0,146,True,0.0,1.5,Everyone,False,False,False,2020,11,10,14.6
47358,Lifestyle,0.0,0.0,56,True,0.0,5.9,Everyone,True,False,False,2020,1,0,56.0
47359,Puzzle,0.0,0.0,5,True,0.0,11.0,Everyone,True,False,False,2020,7,0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89995,Art & Design,0.0,0.0,355,True,0.0,11.0,Everyone,True,False,False,2021,3,0,355.0
89996,Finance,0.0,0.0,663732,True,0.0,35.0,Everyone,False,False,False,2021,2,16,41483.2
89997,Social,0.0,0.0,54,True,0.0,38.0,Teen,False,False,False,2021,3,18,3.0
89998,Business,3.2,9.0,1092,True,0.0,1.7,Everyone,False,False,False,2021,2,0,1092.0


In [30]:
filepath = Path('df_C.csv')
filepath.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(filepath, index=False)

# Model

In [12]:
df_A

Unnamed: 0,Category,Rating,Rating Count,Installs,Free,Price,Size_M,Content Rating,Ad Supported,In App Purchases,Editors Choice,Released_Year,Released_Month,Days_Between,Daily_Avg_Installs
0,Tools,3.9,494.0,99306,True,0.00,0.1,Everyone,False,False,False,2010,12,0,99306.0
1,Finance,4.0,1269.0,317027,True,0.00,0.0,Everyone,False,False,False,2010,3,1,317027.0
2,Books & Reference,0.0,0.0,125,False,1.99,1.3,Everyone,False,False,False,2010,8,0,125.0
3,Productivity,3.2,17.0,8587,True,0.00,0.0,Everyone,True,False,False,2010,12,1,8587.0
4,Tools,3.8,30.0,6454,True,0.00,0.0,Everyone,False,False,False,2010,6,0,6454.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15129,Entertainment,4.7,43.0,6940,True,0.00,5.3,Everyone,True,False,False,2017,12,0,6940.0
15130,Communication,0.0,0.0,22,True,0.00,6.5,Everyone,False,False,False,2017,8,0,22.0
15131,Music & Audio,0.0,0.0,288,True,0.00,5.1,Everyone,False,False,False,2017,4,2,144.0
15132,Personalization,0.0,0.0,14,True,0.00,6.5,Everyone,True,False,False,2017,6,0,14.0


In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error, r2_score, accuracy_score

In [14]:
# Step 1: Convert boolean columns to 0 and 1
df_A['Free'] = df_A['Free'].replace({True: 1, False: 0})
df_A['Ad Supported'] = df_A['Ad Supported'].replace({True: 1, False: 0})
df_A['In App Purchases'] = df_A['In App Purchases'].replace({True: 1, False: 0})
df_A['Editors Choice'] = df_A['Editors Choice'].replace({True: 1, False: 0})

# Step 2: Convert categorical variables into dummy variables
df_A = pd.get_dummies(df_A, columns=['Category', 'Content Rating'])

# Step 3: Feature Scaling
scaler = StandardScaler()

# Selecting numerical columns for scaling
numerical_columns = ['Rating', 'Rating Count', 'Installs', 'Price', 'Size_M', 'Released_Year', 'Released_Month', 'Days_Between']
df_A[numerical_columns] = scaler.fit_transform(df_A[numerical_columns])

  df_A['Free'] = df_A['Free'].replace({True: 1, False: 0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_A['Free'] = df_A['Free'].replace({True: 1, False: 0})
  df_A['Ad Supported'] = df_A['Ad Supported'].replace({True: 1, False: 0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_A['Ad Supported'] = df_A['Ad Supported'].replace({True: 1, False: 0})
  df_A['In App Purchases'] = df_A['In App Purchases'].replace({True: 1, False: 0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead


In [15]:
df_A

Unnamed: 0,Rating,Rating Count,Installs,Free,Price,Size_M,Ad Supported,In App Purchases,Editors Choice,Released_Year,...,Category_Travel & Local,Category_Trivia,Category_Video Players & Editors,Category_Weather,Category_Word,Content Rating_Everyone,Content Rating_Everyone 10+,Content Rating_Mature 17+,Content Rating_Teen,Content Rating_Unrated
0,0.829038,0.194000,0.376387,1,-0.060273,-0.307280,0,0,0,-5.398248,...,False,False,False,False,False,True,False,False,False,False
1,0.876499,0.629323,1.452899,1,-0.060273,-0.309289,0,0,0,-5.398248,...,False,False,False,False,False,True,False,False,False,False
2,-1.021950,-0.083483,-0.114009,0,0.618262,-0.283173,0,0,0,-5.398248,...,False,False,False,False,False,True,False,False,False,False
3,0.496810,-0.073934,-0.072169,1,-0.060273,-0.309289,1,0,0,-5.398248,...,False,False,False,False,False,True,False,False,False,False
4,0.781577,-0.066632,-0.082716,1,-0.060273,-0.309289,0,0,0,-5.398248,...,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15129,1.208728,-0.059329,-0.080313,1,-0.060273,-0.202814,1,0,0,0.747757,...,False,False,False,False,False,True,False,False,False,False
15130,-1.021950,-0.083483,-0.114519,1,-0.060273,-0.178706,0,0,0,0.747757,...,False,False,False,False,False,True,False,False,False,False
15131,-1.021950,-0.083483,-0.113203,1,-0.060273,-0.206832,0,0,0,0.747757,...,False,False,False,False,False,True,False,False,False,False
15132,-1.021950,-0.083483,-0.114558,1,-0.060273,-0.178706,1,0,0,0.747757,...,False,False,False,False,False,True,False,False,False,False


In [16]:
df_A.columns

Index(['Rating', 'Rating Count', 'Installs', 'Free', 'Price', 'Size_M',
       'Ad Supported', 'In App Purchases', 'Editors Choice', 'Released_Year',
       'Released_Month', 'Days_Between', 'Daily_Avg_Installs',
       'Category_Action', 'Category_Adventure', 'Category_Arcade',
       'Category_Art & Design', 'Category_Auto & Vehicles', 'Category_Beauty',
       'Category_Board', 'Category_Books & Reference', 'Category_Business',
       'Category_Card', 'Category_Casino', 'Category_Casual',
       'Category_Comics', 'Category_Communication', 'Category_Dating',
       'Category_Education', 'Category_Educational', 'Category_Entertainment',
       'Category_Events', 'Category_Finance', 'Category_Food & Drink',
       'Category_Health & Fitness', 'Category_House & Home',
       'Category_Libraries & Demo', 'Category_Lifestyle',
       'Category_Maps & Navigation', 'Category_Medical', 'Category_Music',
       'Category_Music & Audio', 'Category_News & Magazines',
       'Category_Parenting

In [17]:
# scaler = StandardScaler()
# columns_to_scale = ['Rating', 'Rating Count', 'Installs', 'Price', 'Size_M', 'Released_Year', 'Released_Month', 'Days_Between']
# df_A[columns_to_scale] = scaler.fit_transform(df_A[columns_to_scale])

In [18]:
df_A

Unnamed: 0,Rating,Rating Count,Installs,Free,Price,Size_M,Ad Supported,In App Purchases,Editors Choice,Released_Year,...,Category_Travel & Local,Category_Trivia,Category_Video Players & Editors,Category_Weather,Category_Word,Content Rating_Everyone,Content Rating_Everyone 10+,Content Rating_Mature 17+,Content Rating_Teen,Content Rating_Unrated
0,0.829038,0.194000,0.376387,1,-0.060273,-0.307280,0,0,0,-5.398248,...,False,False,False,False,False,True,False,False,False,False
1,0.876499,0.629323,1.452899,1,-0.060273,-0.309289,0,0,0,-5.398248,...,False,False,False,False,False,True,False,False,False,False
2,-1.021950,-0.083483,-0.114009,0,0.618262,-0.283173,0,0,0,-5.398248,...,False,False,False,False,False,True,False,False,False,False
3,0.496810,-0.073934,-0.072169,1,-0.060273,-0.309289,1,0,0,-5.398248,...,False,False,False,False,False,True,False,False,False,False
4,0.781577,-0.066632,-0.082716,1,-0.060273,-0.309289,0,0,0,-5.398248,...,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15129,1.208728,-0.059329,-0.080313,1,-0.060273,-0.202814,1,0,0,0.747757,...,False,False,False,False,False,True,False,False,False,False
15130,-1.021950,-0.083483,-0.114519,1,-0.060273,-0.178706,0,0,0,0.747757,...,False,False,False,False,False,True,False,False,False,False
15131,-1.021950,-0.083483,-0.113203,1,-0.060273,-0.206832,0,0,0,0.747757,...,False,False,False,False,False,True,False,False,False,False
15132,-1.021950,-0.083483,-0.114558,1,-0.060273,-0.178706,1,0,0,0.747757,...,False,False,False,False,False,True,False,False,False,False


In [19]:
X = df_A[df_A.columns.difference(['Daily_Avg_Installs'])]
y = df_A.Daily_Avg_Installs

In [20]:
y

0         99306.0
1        317027.0
2           125.0
3          8587.0
4          6454.0
           ...   
15129      6940.0
15130        22.0
15131       144.0
15132        14.0
15133        51.0
Name: Daily_Avg_Installs, Length: 15134, dtype: float64

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)

### Plynomial

In [25]:
polynomial = PolynomialFeatures(degree=2, include_bias= False, interaction_only = False)

X_train_poly = polynomial.fit_transform(X_train)
X_test_poly = polynomial.transform(X_test)

polynomial.get_feature_names_out()

array(['Ad Supported', 'Category_Action', 'Category_Adventure', ...,
       'Released_Year^2', 'Released_Year Size_M', 'Size_M^2'],
      dtype=object)

In [26]:
linear_pol_model = LinearRegression()
linear_pol_model.fit(X_train_poly, y_train)
pred_poly = linear_pol_model.predict(X_test_poly)

In [27]:
root_mean_squared_error(y_test, pred_poly)

88991.92630708315

In [28]:
r2_score(y_test, pred_poly)

0.8833061406499022