# Analysing Laptop Data 


### In this notebook we clean, analyze, and visualize the laptop data to generate meaningful insights and arrive at an appropriate solution to the problem


In [None]:
# importing data and modules 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the laptop pricing CSV from the remote URL into a DataFrame.
file_path= "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DA0101EN-Coursera/laptop_pricing_dataset_mod1.csv"
# index_col=False tells pandas not to use the first column as the row index.
df=pd.read_csv(file_path,index_col=False)

In [None]:
# Summarise columns, non-null counts and dtypes.
df.info()

In [None]:
# Preview the first five rows.
df.head(5)

In [None]:
# Drop the 'Unnamed: 0' column in place.
# NOTE(review): presumably a leftover row index from the CSV export — confirm against the data.
df.drop(columns=['Unnamed: 0'],inplace=True)
# Show the remaining column labels.
df.columns

In [None]:
# Number of rows per manufacturer, most frequent first.
df['Manufacturer'].value_counts()
# Author's observation: Dell laptops are the most common in this data set.

In [None]:
# Treat the '?' placeholder as a missing value so isna()/fillna() can see it.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
df.replace('?',np.nan,inplace=True)


In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Descriptive statistics for every column, numeric and non-numeric alike.
df.describe(include='all')

In [None]:
# Impute missing weights with the column mean.
# Cast to float first in case the column was read as text (e.g. because of
# '?' placeholders), so that the mean is computed on numbers.
df['Weight_kg']=df['Weight_kg'].astype('float')
avg=df['Weight_kg'].mean()
# Assign the result back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated and does not
# modify df when pandas Copy-on-Write is enabled.
df['Weight_kg']=df['Weight_kg'].fillna(avg)

In [None]:
# Re-count missing values per column after the weight imputation.
df.isnull().sum()

In [None]:
# Impute missing screen sizes with the most frequent value (the mode).
# Cast to float first in case the column was read as text, so the later
# unit conversion operates on numbers.
df['Screen_Size_cm']=df['Screen_Size_cm'].astype('float')
screen_size=df['Screen_Size_cm'].value_counts().idxmax()
# Assign the result back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated and does not
# modify df when pandas Copy-on-Write is enabled.
df['Screen_Size_cm']=df['Screen_Size_cm'].fillna(screen_size)


In [None]:
# Check the column dtypes after imputation.
df.dtypes

In [None]:
# Preview the first five rows again.
df.head(5)

In [None]:
# Standardising units: convert screen size from centimetres to inches (1 in = 2.54 cm).
df['Screen_Size_inches']=df['Screen_Size_cm']/2.54


In [None]:
# The centimetre column is redundant once the inch column exists.
df.drop(columns=['Screen_Size_cm'],inplace=True)

In [None]:
# Convert weight from kilograms to pounds. One kilogram is about 2.205 lb,
# so the kilogram value must be MULTIPLIED by the factor (dividing would
# make every laptop roughly five times too light).
df['Weight_pounds']=df['Weight_kg']*2.205
# The kilogram column is redundant once the pound column exists.
df.drop(columns=['Weight_kg'],inplace=True)


In [None]:
# Min-max normalisation: rescale CPU_frequency so that its smallest value
# maps to 0 and its largest to 1.
cpu_freq = df['CPU_frequency']
cpu_low, cpu_high = cpu_freq.min(), cpu_freq.max()
df['CPU_frequency'] = (cpu_freq - cpu_low) / (cpu_high - cpu_low)

In [None]:
# Data binning: turn the continuous Price column into a categorical label.
# Four evenly spaced edges between the min and max price give three equal-width bins.
bins=np.linspace(df['Price'].min(),df['Price'].max(),4)
labels=['Low','Medium','High']
# include_lowest=True makes the first bin closed on the left so the minimum price is labelled too.
df['Price_Label']=pd.cut(df['Price'],bins,labels=labels,include_lowest=True)



In [None]:
# Bar chart of how many laptops fall into each price bin.
# value_counts() orders its result by frequency, not by category, so the
# counts are re-ordered to match `labels` before plotting; otherwise a bar
# could be drawn under the wrong label.
price_label_counts=df['Price_Label'].value_counts().reindex(labels)
plt.bar(labels,price_label_counts.to_numpy())
plt.xlabel('Price labels')
plt.ylabel('Count')
plt.title('Price data for laptops')

In [None]:
# One-hot encode the Screen column: each distinct Screen value becomes its own indicator column.
dummy=pd.get_dummies(df,columns=['Screen'])
dummy.head(5)

In [None]:
# Continue the analysis with the encoded frame.
df=dummy
df.head()

In [None]:
# Persist the cleaned data to Laptop_Data.csv, without writing the row index.
df.to_csv('Laptop_Data.csv',index=False)

In [None]:
# Pairwise correlation matrix of the int64/float64 columns.
df.select_dtypes(include=['int64','float64']).corr()


In [None]:
# Price distribution for each Category value.
sns.boxplot(x='Category',y='Price',data=df)

In [None]:
# Scatter plot with a fitted regression line: Price against RAM_GB.
sns.regplot(x='RAM_GB',y='Price',data=df)
print(df[['RAM_GB','Price']].corr())
# Start the x axis at 0.
plt.xlim(0)

In [None]:
# Pearson correlation (and p-value) of every int64/float64 column against Price.
from scipy import stats
data=df.select_dtypes(include=['int64','float64']).columns.values.tolist()

for col in data:
    # Price correlated with itself is always r=1, p=0, which only adds noise.
    if col=='Price':
        continue
    pearson_r,p_value=stats.pearsonr(df[col],df['Price'])
    print(f'The pearson coorelation for {col} with price is {pearson_r} and p_value is {p_value}')


In [None]:
# Bar chart of the number of laptops in each Category.
# The labels and the heights must come from the SAME value_counts() result:
# unique() lists categories in order of first appearance while value_counts()
# orders by frequency, so pairing the two mislabels the bars.
category_counts=df['Category'].value_counts().sort_index()
labelss=category_counts.index.tolist()
plt.bar(labelss,category_counts.to_numpy())
plt.xlabel('Different Categories')
plt.ylabel('Count')
plt.title('Based on different category total count of laptops')

In [None]:
# Average Price for each (GPU, CPU_core) combination.
df_gptest=df[['GPU','CPU_core','Price']]
grouped_text1=df_gptest.groupby(['GPU','CPU_core'],as_index=False).mean()
print(grouped_text1)

In [None]:
# Reshape into a grid: one row per GPU value, one column per CPU_core value.
grouped_pivot=grouped_text1.pivot(index='GPU',columns='CPU_core')
print(grouped_pivot)

In [None]:
# Heatmap of the mean Price held in grouped_pivot (rows: GPU, columns: CPU_core).
fig, ax = plt.subplots()
im = ax.pcolor(grouped_pivot, cmap='RdBu')

#label names
# NOTE(review): the names are swapped relative to their use. row_labels holds
# the pivot's COLUMN labels (second level of the column MultiIndex) and is
# applied to the x axis; col_labels holds the pivot's INDEX and is applied to
# the y axis. The plot itself is labelled consistently.
row_labels = grouped_pivot.columns.levels[1]
col_labels = grouped_pivot.index

#move ticks and labels to the center
# The +0.5 offset places each tick in the middle of its cell.
ax.set_xticks(np.arange(grouped_pivot.shape[1]) + 0.5, minor=False)
ax.set_yticks(np.arange(grouped_pivot.shape[0]) + 0.5, minor=False)

#insert labels
ax.set_xticklabels(row_labels, minor=False)
ax.set_yticklabels(col_labels, minor=False)

fig.colorbar(im)

## Model Development

In [None]:
# Pearson correlation and p-value between CPU_frequency and Price.
from scipy.stats import pearsonr
corr,p_value=pearsonr(df['CPU_frequency'],df['Price'])
print(f'{corr},{p_value}')

In [None]:
# Keep only the int64/float64 columns; reused below as the multiple-regression inputs.
df_numeric=df.select_dtypes(include=['int64','float64'])
print(df_numeric)

In [None]:
# Simple linear regression: Price modelled from CPU_frequency alone.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error
lm=LinearRegression()
lm.fit(df[['CPU_frequency']],df[['Price']])


In [None]:
# Fitted slope.
lm.coef_

In [None]:
# Fitted intercept.
lm.intercept_

In [None]:
# In-sample predictions: the same rows the model was fitted on.
predict=lm.predict(df[['CPU_frequency']])

In [None]:
# First four predicted prices.
predict[0:4]

In [None]:
# Observed points in yellow, fitted line in red.
plt.scatter(x=df[['CPU_frequency']],y=df[['Price']],color='yellow')
plt.plot(df[['CPU_frequency']],predict,color='red')
plt.title('Actual Price vs Predicted Price')

In [None]:
residuals=predict-df[['Price']]

In [None]:
sns.residplot(x=df[['Price']],y=residuals,lowess=True)

In [None]:
# Compare the distribution of actual prices with the model's predictions.
# Both inputs are passed as 1-D data: a DataFrame or an (n, 1) array is
# treated by seaborn as wide-form data, in which case `color` is not applied.
sns.kdeplot(df['Price'],color='green',label='Actual')
sns.kdeplot(np.ravel(predict),color='blue',label='Predicted')
plt.title('Actual values and predicted values probability denisty')
# Without a legend the two curves cannot be told apart.
plt.legend()

In [None]:
# In-sample error metrics for the simple linear regression.
mae_slr,mse_slr=mean_absolute_error(df['Price'],predict),mean_squared_error(df['Price'],predict)
print(f'MAE {mae_slr}, MSE {mse_slr}')
# The outer quotes must differ from the quotes used inside the braces:
# re-using the same quote type in an f-string is a SyntaxError before Python 3.12.
print(f"R square value {lm.score(df[['CPU_frequency']],df['Price'])}")

## Multiple Regression


In [None]:
# Predictors: every int64/float64 column except the target; target: Price.
x=df_numeric.drop(columns=['Price'])
y=df[['Price']]

In [None]:
lm1=LinearRegression()


In [None]:
# Fit on the full data set (no train/test split is made in this notebook).
lm1.fit(x,y)

In [None]:
# In-sample predictions.
results=lm1.predict(x)

In [None]:
# First four predicted prices.
results[0:4]

In [None]:
# Compare the distribution of actual prices with the multiple-regression predictions.
# Both inputs are passed as 1-D data: a DataFrame or an (n, 1) array is
# treated by seaborn as wide-form data, in which case `color` is not applied.
sns.kdeplot(y['Price'],color='red',label='Actual')
sns.kdeplot(np.ravel(results),color='blue',label='Predicted')
plt.title('Actual value and predicted value of price')
# Without a legend the two curves cannot be told apart.
plt.legend()

In [None]:
residualss=y-results

In [None]:
sns.residplot(x=y,y=results,lowess=True)

In [None]:
# In-sample error metrics and R^2 for the multiple regression.
actual_price = df['Price']
mse_mlr = mean_squared_error(actual_price, results)
mae_mlr = mean_absolute_error(actual_price, results)
r2 = lm1.score(x, y)
print(f'MAE = {mae_mlr}, MSE={mse_mlr} ,R2={r2}')

## Polynomial Regression

In [None]:
# Fit a degree-3 polynomial of Price on CPU_frequency.
cpu=df['CPU_frequency']
price=df['Price']
# c holds the coefficients, highest power first; f wraps them as a callable polynomial.
c=np.polyfit(cpu,price,3)
f=np.poly1d(c)
print(c)

In [None]:
# Print the fitted polynomial.
print(f)

In [None]:
def PlotPolly(model, independent_variable, dependent_variabble, Name):
    """Plot the observed points together with a fitted polynomial curve.

    Parameters
    ----------
    model : callable
        Fitted polynomial; it is called with an array of x values and must
        return the corresponding y values (e.g. a ``numpy.poly1d``).
    independent_variable : array-like
        Observed x values; must provide ``.min()`` and ``.max()``.
    dependent_variabble : array-like
        Observed y values, plotted against ``independent_variable``.
    Name : str
        Used for the x-axis label and inside the plot title.

    Returns
    -------
    None
        The function only draws on the current matplotlib figure.
    """
    # Evaluate the model on 100 evenly spaced points across the observed range
    # so the curve is smooth regardless of how the data is distributed.
    x_new = np.linspace(independent_variable.min(), independent_variable.max(), 100)
    y_new = model(x_new)

    # Dots for the observations, a solid line for the fitted curve.
    plt.plot(independent_variable, dependent_variabble, '.', x_new, y_new, '-')
    plt.title(f'Polynomial Fit for Price ~ {Name}')
    ax = plt.gca()
    ax.set_facecolor((0.898, 0.898, 0.898))
    plt.xlabel(Name)
    plt.ylabel('Price of laptops')

In [None]:
PlotPolly(f,cpu,price,'CPU-PRICE')

In [None]:
mae_pr,mse_pr=mean_absolute_error(price,f(cpu)),mean_squared_error(price,f(cpu))


In [None]:
mae_pr

In [None]:
mse_pr

In [None]:
from sklearn.metrics import r2_score
r2_score(price,f(cpu))

### Pipeline

In [None]:
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.pipeline import Pipeline


In [None]:
input=[('scale',StandardScaler()),('polynomial',PolynomialFeatures(include_bias=False)),('model',LinearRegression())]

In [None]:
pipe=Pipeline(input)

In [None]:
# Fit the whole pipeline on the numeric predictors and the Price target.
pipe.fit(x,df['Price'])


In [None]:
# In-sample predictions; show the first four.
predicts=pipe.predict(x)
predicts[:4]

In [None]:
# Compare the distribution of actual prices with the pipeline's predictions.
sns.kdeplot(df['Price'],color='r',label='Actual value')
sns.kdeplot(predicts,color='blue',label='Predicted values')
# The label= arguments are only shown once a legend is drawn.
plt.legend()

In [None]:
# In-sample mean absolute error of the pipeline.
mae_plr=mean_absolute_error(df['Price'],predicts)

In [None]:
# In-sample mean squared error and R^2 of the pipeline.
mse_plr=mean_squared_error(df['Price'],predicts)
r_square=pipe.score(x,df['Price'])

In [None]:
# Printed in the order: MAE, MSE, R^2.
print(f'{mae_plr},{mse_plr},{r_square}')