In [86]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [87]:
df=pd.read_csv("data_modified1.csv")

# Missing Values

In [88]:
numerical_with_nan=[feature for feature in df.columns if df[feature].isnull().sum()>1 and df[feature].dtypes!='O']
for feature in numerical_with_nan:
    print("{}: {}% missing value".format(feature,np.around(df[feature].isnull().mean(),4)))

Battery in mAh: 0.0625% missing value
Ram in GB: 0.0694% missing value


In [89]:
for feature in numerical_with_nan:
    median_value=df[feature].median()
    df[feature].fillna(median_value,inplace=True)

In [90]:
df[numerical_with_nan].isnull().sum()

Battery in mAh    0
Ram in GB         0
dtype: int64

In [91]:
categorical_features=[feature for feature in df.columns if df[feature].dtype=='O']
categorical_features

['product_brand', 'Product_model', 'Colour']

In [92]:
categorical_with_nan=[feature for feature in categorical_features if df[feature].isnull().sum()>=1]
for feature in categorical_with_nan:
    print("{}: {}% missing value".format(feature,np.around(df[feature].isnull().mean(),4)))

Product_model: 0.0017% missing value
Colour: 0.3299% missing value


In [93]:
for feature in categorical_features:
    labels_ordered=df.groupby([feature])['price'].mean().sort_values().index
    labels_ordered={k:i for i,k in enumerate(labels_ordered,0)}
    df[feature]=df[feature].map(labels_ordered)

In [94]:
df.head(60)

Unnamed: 0,product_brand,Product_model,Colour,price,rating,Product_dimention in cm,Battery in mAh,Ram in GB,Rom in GB,Back_Camera in MP,Front_Camera in MP
0,54,273.0,124.0,9499.0,4.4,15.8,5000.0,4.0,64.0,12.0,8.0
1,53,219.0,70.0,16999.0,4.4,16.51,4300.0,8.0,128.0,64.0,16.0
2,54,273.0,65.0,9499.0,4.4,15.8,5000.0,4.0,64.0,12.0,8.0
3,53,258.0,101.0,19999.0,4.4,16.76,4300.0,8.0,128.0,64.0,16.0
4,47,134.0,88.0,7499.0,4.5,13.84,4000.0,4.0,64.0,13.0,8.0
5,53,160.0,12.0,9999.0,4.4,16.56,5000.0,4.0,64.0,12.0,5.0
6,53,160.0,9.0,8999.0,4.4,16.56,5000.0,3.0,32.0,12.0,5.0
7,53,160.0,12.0,8999.0,4.4,16.56,5000.0,3.0,32.0,12.0,5.0
8,53,160.0,9.0,9999.0,4.4,16.56,5000.0,4.0,64.0,12.0,5.0
9,53,219.0,70.0,13999.0,4.4,16.51,4300.0,4.0,64.0,64.0,16.0


In [95]:
median_value=df['Colour'].median()
df['Colour'].fillna(median_value,inplace=True)

In [96]:
median_value=df['Product_model'].median()
df['Product_model'].fillna(median_value,inplace=True)

In [97]:
df.apply(lambda x: sum(x.isnull()),axis=0)

product_brand              0
Product_model              0
Colour                     0
price                      0
rating                     0
Product_dimention in cm    0
Battery in mAh             0
Ram in GB                  0
Rom in GB                  0
Back_Camera in MP          0
Front_Camera in MP         0
dtype: int64

In [98]:
df.dtypes

product_brand                int64
Product_model              float64
Colour                     float64
price                      float64
rating                     float64
Product_dimention in cm    float64
Battery in mAh             float64
Ram in GB                  float64
Rom in GB                  float64
Back_Camera in MP          float64
Front_Camera in MP         float64
dtype: object

# Normalization

In [99]:
for feature in df.columns:
    if 0 in df[feature].unique():
        pass
    else:
        df[feature]=np.log(data[feature])

# Removing Outliers

In [100]:
Q1=df.quantile(0.25)
Q3=df.quantile(0.75)
IQR=Q3-Q1
df_out=df[~((df>(Q1+1.5*IQR))|(df<(Q1-1.5*IQR))).any(axis=1)]

# Model

In [101]:
x=df_out.iloc[:,[0,1,2,4,5,6,7,8,9,10]]
y=df_out.iloc[:,3]
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=0)
from sklearn.linear_model import LinearRegression
classifier=LinearRegression()
classifier.fit(x_train, y_train)
y_pred=classifier.predict(x_test)

In [102]:
from sklearn.metrics import r2_score
score=r2_score(y_test,y_pred)

In [103]:
score

0.9791497523913003