In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import warnings
from sklearn.linear_model import LinearRegression

from sklearn.metrics import r2_score
warnings.filterwarnings('ignore')

In [4]:
hp_df=pd.read_csv('House price train.csv')

y=hp_df['SalePrice']

In [None]:
# identifying cols with NAs greater than 50 percentage(no such rule)


na_df=pd.DataFrame({'col_name':hp_df.columns,'na_pc':hp_df.isnull().sum()/hp_df.shape[0]*100})

col_grt50pc_nas=list(na_df[na_df['na_pc']>50]['col_name'])

In [84]:
# identifying the non sense columns
cols_to_drop=[]
for col in hp_df.columns:
    if hp_df[col].nunique()==1 or hp_df[col].nunique()==hp_df.shape[0]:
        cols_to_drop.append(col)
        
cols_to_drop=cols_to_drop+col_grt50pc_nas

cols_to_drop.append('SalePrice')

cols_to_drop

['Id', 'Alley', 'PoolQC', 'Fence', 'MiscFeature', 'SalePrice']

In [85]:
hp_df.drop(columns=cols_to_drop,inplace=True)# dropping all the identified columns(from the above steps)

In [86]:
# Train Test Split

X_train,X_test,y_train,y_test=train_test_split(hp_df,y,test_size=0.2,random_state=42)

In [87]:
cat_cols=[col for col in hp_df.columns if hp_df[col].dtype=='object']
con_cols=[col for col in hp_df.columns if hp_df[col].dtype=='int64' or hp_df[col].dtype=='float64']

In [88]:
# Filling missing values(mean for the continuous variables, mode for the categoricals)

for col in con_cols:
    X_train[col].fillna(X_train[col].mean(),inplace=True)
    X_test[col].fillna(X_train[col].mean(),inplace=True)# filling the test as well with train mean
    
    
for col in cat_cols:
    X_train[col].fillna(X_train[col].mode()[0],inplace=True)
    X_test[col].fillna(X_train[col].mode()[0],inplace=True)


In [89]:
# scaling the continuous variables

scaler=StandardScaler()

for col in con_cols:
    X_train[col]=scaler.fit_transform(np.array(X_train[col]).reshape(-1,1))
    X_test[col]=scaler.transform(np.array(X_test[col]).reshape(-1,1))
    
    

In [90]:
oe_train=pd.get_dummies(X_train[cat_cols])
oe_test=pd.get_dummies(X_test[cat_cols])

In [98]:
oe_train_final,oe_test_final=oe_train.align(oe_test,join='inner',axis=1)

In [100]:
X_train_final=pd.concat([X_train[con_cols],oe_train_final],axis=1)
X_test_final=pd.concat([X_test[con_cols],oe_test_final],axis=1)

In [113]:
linreg=LinearRegression()

linreg.fit(X_train_final,y_train)

test_pred=linreg.predict(X_test_final)

train_pred=linreg.predict(X_train_final)

In [114]:

print('Train Score:',r2_score(y_train,train_pred))
print('Test Score:',r2_score(y_test,test_pred))

Train Score: 0.9081047887784507
Test Score: 0.874618595109526


In [64]:
train=['Hyderabad','Chennai','Bangalore','Hyderabad','Chennai','Pune','Bangalore']

pd.get_dummies(train)

Unnamed: 0,Bangalore,Chennai,Hyderabad,Pune
0,0,0,1,0
1,0,1,0,0
2,1,0,0,0
3,0,0,1,0
4,0,1,0,0
5,0,0,0,1
6,1,0,0,0


In [66]:
test=['Hyderabad','Mumbai','Ahmedabad','Bangalore','Mumbai','Hyderabad','Kochi','Kochi']

pd.get_dummies(test)

Unnamed: 0,Ahmedabad,Bangalore,Hyderabad,Kochi,Mumbai
0,0,0,1,0,0
1,0,0,0,0,1
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,0,0,1
5,0,0,1,0,0
6,0,0,0,1,0
7,0,0,0,1,0
