                                                                         
## Project Name: 
## Make a model to predict the app rating, with other information about the app provided.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the raw Play Store export into the working DataFrame.
appdata = pd.read_csv(filepath_or_buffer='googleplaystore.csv')

In [None]:
appdata.head(5)

In [None]:
appdata.tail(5)

In [None]:
len(appdata)

## Null Values

In [None]:
appdata.isna().sum()

## Drop records with nulls in any of the columns.

In [None]:
# Keep only fully populated rows: a row with a null in any column is discarded.
appdata = appdata.dropna()

In [None]:
appdata

In [None]:
appdata.isna().sum()

# convert "size" column Mb to Kb

In [None]:
sizeunique = appdata['Size'].unique()

In [None]:
sizeunique

In [None]:
def size(value):
    """Convert a size label to kilobytes.

    Parameters
    ----------
    value : str
        Size label ending in ``'M'`` (megabytes) or ``'k'`` (kilobytes),
        for example ``'19M'`` or ``'201k'``.

    Returns
    -------
    float
        The size in kilobytes; megabyte values are multiplied by 1000.
        ``np.nan`` is returned for everything else: labels without a
        recognised unit suffix (e.g. ``'Varies with device'``), non-string
        input such as a missing value, or a numeric part that cannot be
        parsed.
    """
    # Missing cells arrive as float NaN / None, which cannot be sliced.
    if not isinstance(value, str):
        return np.nan

    label = value.strip()
    kb_per_unit = {'M': 1000, 'k': 1}
    factor = kb_per_unit.get(label[-1:])
    if factor is None:
        return np.nan

    try:
        return float(label[:-1]) * factor
    except ValueError:
        # A unit suffix without a parsable number is treated as unknown.
        return np.nan

In [None]:
# Replace every size label by its value in kilobytes (NaN when unknown).
appdata['Size'] = appdata['Size'].map(size)

In [None]:
sizeunique = appdata['Size'].unique()
sizeunique

In [None]:
appdata.Size.describe()

In [None]:
appdata.isna().sum()

In [None]:
'''
for i in appdata.columns[appdata.isnull().any(axis=0)]:     #---Applying Only on variables with NaN values .. fill mean at N
    appdata[i].fillna(appdata[i].mean(),inplace=True)
'''

In [None]:
#appdata.Size.unique()

In [None]:
#appdata.isna().sum()

# Type of data

In [None]:
appdata.dtypes

## convert object to numeric

In [None]:
# Parse the review counts as numbers; an unparsable entry raises.
appdata['Reviews'] = pd.to_numeric(appdata['Reviews'], errors='raise')

In [None]:
appdata.Reviews.dtypes

# Remove "+" and "," from the "Installs" column and convert it to float

In [None]:
import warnings
# NOTE(review): with no category given, this silences every warning for the
# rest of the session, deprecation notices included — consider narrowing it.
warnings.filterwarnings("ignore")

In [None]:
appdata['Installs']=appdata['Installs'].astype(str)     # converting datatype to string

In [None]:
# Strip the thousands separators from the install counts.
appdata['Installs'] = appdata['Installs'].str.replace(',', '', regex=False)

In [None]:
appdata

In [None]:
# Strip the trailing "+" from the install counts.  "+" is a regex
# metacharacter, so the pattern is passed as a literal string explicitly
# instead of relying on the pandas-version-dependent default of `regex`.
appdata['Installs'] = appdata['Installs'].str.replace('+', '', regex=False)

In [None]:
appdata

In [None]:
# The cleaned install counts are now plain digit strings; store them as floats.
appdata['Installs'] = appdata['Installs'].astype('float64')

In [None]:
appdata['Installs'].dtypes

# 4. Price field is a string and has 'dollar' symbol. Remove and convert it to numeric.

remove $ and convert Price column to numeric
 

In [None]:
appdata['Price'] = appdata['Price'].astype(str)    # converting to string

In [None]:
# Remove the currency symbol.  "$" is a regex anchor, so it is replaced as a
# literal string explicitly rather than depending on the default of `regex`.
appdata['Price'] = appdata['Price'].str.replace('$', '', regex=False)

In [None]:
appdata['Price'].unique()

In [None]:
# Parse the cleaned price strings as numbers, then show the resulting dtype.
appdata['Price'] = pd.to_numeric(appdata['Price'], errors='raise')
appdata['Price'].dtype

### .................................observations...............................
    Price column has some values which are very high and these values will act like outliers.
    need to replace them with average value.
    

# 5. Sanity checks:
 

### 1. Average rating should be between 1 and 5 as only these values are allowed on the play store.

In [None]:
appdata.Rating.max()      # checking the maximum rating

In [None]:
appdata.Rating.min()      # checking the minimum rating

### .....................................observation................................
    The minimum and maximum rating is in between 1 to 5.
    No rating below 1 and above 5.

### 2. Reviews should not be more than installs as only those who installed can review the app.

In [None]:
# Sanity rule: an app cannot have more reviews than installs.  Review counts
# that break the rule are set to NaN here; the rows themselves are kept.
appdata['Reviews'] = appdata['Reviews'].where(appdata['Reviews'].le(appdata['Installs']))
appdata['Reviews']

In [None]:
appdata['Reviews'].isnull().sum()

In [None]:
# Discard every row that holds a NaN in any column.
# NOTE(review): this is not limited to the Reviews column — rows whose Size
# was mapped to NaN by size() are removed here as well; confirm that is wanted.
appdata = appdata.dropna(axis=0, how='any')

In [None]:
appdata.shape


### ...............................observations...................................
    In 7 records reviews are more than installs.
    It means these are fake records, and not important to keep.
    so, I dropped such records.
    
    

### 3. For free apps (type = “Free”), the price should not be >0.

In [None]:
# Keep the Type label only on rows with a positive price (other rows become
# NaN); a 'Free' label surviving here would flag an inconsistent record.
free_price = appdata['Type'].where(appdata['Price']>0)

In [None]:
# If 'Free' is absent from this output, no free app carries a price.
free_price.unique()


### ....................................observations...............................
    1. by using where command we will get all those records which has value more than zero.
    2. if free apps charging price then 'Free' will appear in the results.
    
    There is no 'Free' app which has price more than 0. 

# 5. Performing univariate analysis:

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

### • Boxplot for Price

In [None]:
# Wide, short canvas for a single horizontal boxplot of the app prices.
plt.figure(figsize=(12, 3))
sns.boxplot(x=appdata['Price'])


### .............................observations............................

    1. In the above data, app prices range from 0 to 400 dollars.
    2. Generally, the average price of an app on the Play Store is around 9 dollars.
    3. Apps priced at 200 dollars or more are too expensive for a customer to buy.
    4. Those apps may also be fake.

### • Boxplot for Reviews
   

In [None]:
# Draw the Reviews boxplot twice: first on a linear axis, then on a log axis
# so the bulk of the distribution is not flattened by the extreme values.
review_counts = appdata.Reviews.values
for log_axis in (False, True):
    plt.figure(figsize=(16, 3))
    sns.boxplot(x=review_counts)
    plt.xlabel('Reviews')
    if log_axis:
        plt.xscale('log')

### .............................observations................................
    
    The values aren't spread well.
    More values are concentrated within 0.1 million(1 lakh) reviews.
    It has so many outliers.
    To normalize this data we need a particular threshold value.
    

### • Histogram for Rating
   

In [None]:
# Distribution of ratings in bins of 0.1.
plt.figure(figsize=(16, 6))
rating_ax = sns.histplot(x=appdata['Rating'], binwidth=0.1)
rating_ax.set_title('Rating')

### ......................................observations..............................
    The distribution of data is not normal.
    The rating values are more towards higher ratings.
    

### • Histogram for Size


In [None]:
# Distribution of app sizes in bins of 2000 (Size is expressed in kb).
plt.figure(figsize=(16, 6))
size_ax = sns.histplot(x=appdata['Size'], binwidth=2000)
size_ax.set_title('Size (kb)')

### .................................observations.................................
    
    The data of 'Size' is more towards lower size.
    In the above four plots, Price and Reviews have outliers and their impact is high.
    However, 'Rating' and 'Size' are not distributed evenly.

# 6. Outlier treatment


### 1. Price: 
    From the box plot, it seems like there are some apps with very high price. 
    A price of dollar 400 for an application on the Play Store is very high and suspicious.

In [None]:
appdata['Price'].sort_values(ascending=False).head(20)    # checking records with very high price

In [None]:
# Prices of 200 or more are treated as outliers: the price is set to NaN,
# while the record itself stays in the DataFrame.
appdata['Price'] = appdata['Price'].mask(appdata['Price'] >= 200)

In [None]:
appdata['Price'].sort_values(ascending=False).head(20)     # after removing apps of price more than $200

### ...............................observations............................

    $400 is indeed a very high price for any app
    15 records are there where the price is more than $200.
    other all records price is not more than $80.
    apps having price more than $200 are of no use for analysis.
    
    After removing these prices, the highest price for an app is not more than $80.

### 2. Reviews: 
    Very few apps have very high number of reviews. 
    These are all star apps that don’t help with the analysis and, in fact, will skew it. 
    Dropping records having more than 2 million reviews.

In [None]:
# The 50 apps with the most reviews, highest first.
appdata.sort_values(by='Reviews', ascending=False).head(50)

In [None]:
# Review counts of 2 million or more are set to NaN; the rows are kept.
appdata['Reviews'] = appdata['Reviews'].mask(appdata['Reviews'] >= 2000000)

In [None]:
# Same ranking after masking: the 50 largest remaining review counts.
appdata.sort_values(by='Reviews', ascending=False).head(50)

In [None]:
# Boxplot of the remaining review counts on a logarithmic x-axis.
plt.figure(figsize=(14, 3))
reviews_ax = sns.boxplot(x=appdata['Reviews'])
reviews_ax.set_xscale('log')

### 3. Installs: 
    
    Apps having very high number of installs should be dropped from the analysis.

In [None]:
# Installs at the 10th, 25th, 50th, 70th, 90th, 95th and 99th percentiles.
q1, q2, q3, q4, q5, q6, q7 = np.percentile(appdata['Installs'], q=[10, 25, 50, 70, 90, 95, 99])

In [None]:
# Report the percentiles on a single line.
print(f'q10={q1} q25={q2} q50={q3} q70={q4} q90={q5} q95={q6} q99={q7}')

### .........................................observations.......................................
    1. 50% of installs are under 0.1 million.
    2. 70% of installs are under 1 million.
    3. 90% of installs are under 10 million.
    4. 95% of installs are under 50 million.
    
    my most of the data is under 10 million.
    90% of data lies in between 1 install to 10 million installs.
    remaining 10% data lies in between 10 million to 100 million installs.
    
    here I will take 90 percentile to drop outlier records.

In [None]:
appdata['Installs'].describe()


### ..................................observations.........................................

    1. maximum installs = 100000000 (100 million)
    2. minimum installs = 1
    3. mean of installs = 8424070 (8.4 million)
    4. standard deviation = 50157420 (50.15 million)
    
    outliers are affecting mean.
    

###  drop records from installs more than threshold value

In [None]:
# Install counts above 10 million are set to NaN; the rows themselves stay.
appdata['Installs'] = appdata['Installs'].mask(appdata['Installs'] > 10000000)
appdata

In [None]:
appdata.isnull().sum()

### .........................observations...........................

    1. Total 388 records have been dropped.
    2. Column Reviews, Installs and Price have some nan values.

# 7. Bivariate analysis: Let’s look at how the available predictors relate to the target variable (Rating)

### 1. scatter plot for Rating vs. Price

In [None]:
# Rating (y) against Price (x).
plt.figure(figsize=(13, 5))
sns.scatterplot(data=appdata, x='Price', y='Rating')

### ..........................observations.............................

    1. Most apps are free.
    2. Apps with a high price don't have low ratings.
    3. Paid apps tend to have higher ratings.

### scatter plot for Rating vs. Size

In [None]:
# Rating (y) against Size (x).
plt.figure(figsize=(13, 5))
sns.scatterplot(data=appdata, x='Size', y='Rating')


### ..............................observations................................

    1. It looks like heavier apps have slightly higher ratings than lighter ones.
    2. Rating is concentrated towards higher rating
    

### scatter plot for Rating vs. Reviews

In [None]:
# Rating (y) against Reviews (x), with the review axis on a log scale.
plt.figure(figsize=(13, 5))
sns.scatterplot(data=appdata, x='Reviews', y='Rating')
plt.xscale('log')


### ..............................observations...........................

    1. Looks like more the review better the rating.
    2. concentration of data towards higher rating increases as the reviews increase.
    

In [None]:
# Joint distribution of Rating (x) and Reviews (y).
# jointplot is a figure-level function that builds its own figure, so no
# plt.figure() call precedes it — that would only leave an empty canvas behind.
sns.jointplot(y=appdata.Reviews, x=appdata.Rating)

### boxplot for Rating vs. Content Rating

In [None]:
# One horizontal Rating boxplot per Content Rating group.
plt.figure(figsize=(13, 5))
sns.boxplot(data=appdata, x='Rating', y='Content Rating')

### .............................observations...............................

    here lower rating is an outlier.
    values are more towards higher rating.
    Adults only genre has good ratings.

### boxplot for Ratings vs. Category

In [None]:
# One horizontal Rating boxplot per Category; a tall canvas fits all groups.
plt.figure(figsize=(10, 14))
sns.boxplot(data=appdata, x='Rating', y='Category')

# 8. Data preprocessing

###  creating copy of dataframe

In [None]:
# Work on an independent copy so the preprocessing below leaves appdata untouched.
inp1 = appdata.copy(deep=True)
inp1

In [None]:

# Density histogram with a KDE overlay of the raw review counts.
# histplot replaces the deprecated distplot; stat='density' keeps the
# normalised y-axis that distplot produced.
sns.histplot(x=inp1['Reviews'], kde=True, stat='density')

In [None]:

# Density histogram with a KDE overlay of the raw install counts.
# histplot replaces the deprecated distplot; stat='density' keeps the
# normalised y-axis that distplot produced.
sns.histplot(x=inp1['Installs'], kde=True, stat='density')

### ...................................observations.......................................
    1. both Reviews and Installs are not normally distributed.
    2. still they have outliers.

In [None]:
# Compress the long right tail of Reviews with log(1 + x).
inp1['Reviews'] = np.log1p(inp1['Reviews'])

In [None]:
# Compress the long right tail of Installs with log(1 + x).
inp1['Installs'] = np.log1p(inp1['Installs'])

In [None]:
inp1

In [None]:
# Density histogram with a KDE overlay of the log-transformed review counts.
# histplot replaces the deprecated distplot; stat='density' keeps the
# normalised y-axis that distplot produced.
sns.histplot(x=inp1['Reviews'], kde=True, stat='density')

In [None]:
# Density histogram with a KDE overlay of the log-transformed install counts.
# histplot replaces the deprecated distplot; stat='density' keeps the
# normalised y-axis that distplot produced.
sns.histplot(x=inp1['Installs'], kde=True, stat='density')

### drop unnecessary columns

In [None]:
inp1.columns

In [None]:
# Remove the name, date and version columns from the modelling frame.
inp1.drop(columns=['App', 'Last Updated', 'Current Ver', 'Android Ver'], inplace=True)

In [None]:
inp1.columns

In [None]:
inp1

### checking NAN values

In [None]:
inp1.isnull().sum()

In [None]:
inp1.describe()

### getting median

In [None]:
import statistics

In [None]:
# Median of Reviews.  The column holds NaNs from the outlier masking, and
# statistics.median sorts its input, which is unreliable with NaN present;
# the pandas median skips missing values.
inp1['Reviews'].median()

In [None]:
# Median of Installs.  The column holds NaNs from the outlier masking, and
# statistics.median sorts its input, which is unreliable with NaN present;
# the pandas median skips missing values.
inp1['Installs'].median()

In [None]:
# Median of Price.  The column holds NaNs from the outlier masking, and
# statistics.median sorts its input, which is unreliable with NaN present;
# the pandas median skips missing values.
inp1['Price'].median()

### replacing NAN values with median

In [None]:
# Fill every column that still contains NaNs with that column's median.
# The filled column is assigned back to the frame: calling
# fillna(inplace=True) on inp1[col] is chained assignment, which acts on a
# temporary object and does not update inp1 under pandas Copy-on-Write.
for col in inp1.columns[inp1.isnull().any(axis=0)]:
    inp1[col] = inp1[col].fillna(inp1[col].median())

In [None]:
inp1.isnull().sum()

###  dummy encoding

In [None]:
# One-hot encode the categorical predictors.
categorical_cols = ['Category', 'Content Rating', 'Genres', 'Type']
inp2 = pd.get_dummies(inp1, columns=categorical_cols)

In [None]:
# Allow up to 200 columns to be shown when the encoded frame is displayed.
pd.options.display.max_columns = 200
inp2

In [None]:
# Feature matrix: every column except the target.  The target is removed by
# name, so the features cannot accidentally include Rating (or lose a real
# predictor) if the column order of inp2 ever changes.
x = inp2.drop(columns=['Rating'])

In [None]:
x

In [None]:
# Target kept as a single-column DataFrame (not a Series).
y = inp2[['Rating']]

In [None]:
y

## 9. splitting data into training and testing

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(x, y, random_state=0, test_size=0.3)
x_train, x_test, y_train, y_test = split_parts

## 11. Model building

### Fitting linear regression model into the training set

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares model fitted on the training split.
regressor = LinearRegression()
regressor.fit(X=x_train, y=y_train)

### predicting the test set results 

In [None]:
y_pred = regressor.predict(x_test)

In [None]:
y_pred

In [None]:
y_test

In [None]:
x_test.shape

In [None]:
y_test.shape

###  Visualising the results

In [None]:
# Actual ratings on the x-axis against the model's predictions on the y-axis.
plt.scatter(x=y_test, y=y_pred)
plt.show()

In [None]:
sns.regplot(x=y_test,y=y_pred,ci=None)

## 12. reporting r2

In [None]:
from sklearn.metrics import r2_score

In [None]:
# r2_score gives the coefficient of determination (R^2) of the predictions;
# the variable keeps its name even though R^2 is not a classification accuracy.
accuracy = r2_score(y_true=y_test, y_pred=y_pred)
accuracy

###  difference between y_test and y_pred

In [None]:
(y_test-y_pred).max()

In [None]:
(y_test-y_pred).min()

In [None]:
(y_test-y_pred).describe()     # summary statistics of the differences (actual minus predicted)

## Observations

    The model has very low accuracy.
    Other parameters also have to be considered, such as the user's gender, age, likes, total usage, region, etc.
    This means the data is insufficient to predict the rating accurately.
    Even after changing test_size, the accuracy remains low.
    
    