In [533]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import r2_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import GridSearchCV

In [534]:
df=pd.read_csv('car.csv')
df.head()

In [535]:
df=df.drop('name',axis=1) # columns = 'name'
df.head()

Unnamed: 0,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,2014,450000,141000,Diesel,Individual,Manual,Second Owner


### One-hot encoding

In [536]:
#dummy variables
df_cat=df[['fuel','seller_type','transmission','owner']]
df_cat=pd.get_dummies(data=df_cat,drop_first=True)
df=df[['year','selling_price','km_driven']].join(df_cat)
df.head()

Unnamed: 0,year,selling_price,km_driven,fuel_Diesel,fuel_Electric,fuel_LPG,fuel_Petrol,seller_type_Individual,seller_type_Trustmark Dealer,transmission_Manual,owner_Fourth & Above Owner,owner_Second Owner,owner_Test Drive Car,owner_Third Owner
0,2007,60000,70000,0,0,0,1,1,0,1,0,0,0,0
1,2007,135000,50000,0,0,0,1,1,0,1,0,0,0,0
2,2012,600000,100000,1,0,0,0,1,0,1,0,0,0,0
3,2017,250000,46000,0,0,0,1,1,0,1,0,0,0,0
4,2014,450000,141000,1,0,0,0,1,0,1,0,1,0,0


In [537]:
#podzial na zmienną zależną i zmienne niezależne
y=df['selling_price']
X=df.drop('selling_price',axis=1)

In [538]:
#zbuduj 2 modele
#drzewo decyzyjne
#regresja liniowa

#regresja
reg_model=LinearRegression()
reg_model.fit(X,y)
reg_pred=reg_model.predict(X)

#drzewo decyzyjne
tree_model=DecisionTreeRegressor(random_state=111,min_samples_split=400)
tree_model.fit(X,y)
tree_pred=tree_model.predict(X)

In [539]:
#sprawdz skutecznosc modeli na podstawie nastepujacych metryk

In [540]:
#R2
print('Linear Regression R2 score: '+str(round(r2_score(y,reg_pred),2)))
print('Decision Tree R2 score: '+str(round(r2_score(y,tree_pred),2)))

Linear Regression R2 score: 0.46
Decision Tree R2 score: 0.5


In [541]:
#MAE
print('Linear Regression MAE score: '+str(round(mae(y,reg_pred),2)))
print('Decision Tree MAE score: '+str(round(mae(y,tree_pred),2)))

Linear Regression MAE score: 229254.77
Decision Tree MAE score: 198304.84


In [542]:
#RMSE
print('Linear Regression RMSE score: '+str(round(mse(y,reg_pred,squared=False),2)))
print('Decision Tree RMSE score: '+str(round(mse(y,tree_pred,squared=False),2)))

Linear Regression RMSE score: 425388.24
Decision Tree RMSE score: 410557.26


### Skuteczność modeli dla samochodów napedzanych benzyną

In [543]:
#sprawdz skuteczność modeli dla samochodów napedzanych benzyną

#jeśli fuel ==1 to pokaze selling_price
y_petrol=df[df['fuel_Petrol']==1]['selling_price']

# pokaz df, gdzie fuel=1 i drop selling_price
X_petrol=df[df['fuel_Petrol']==1].drop('selling_price',axis=1)

reg_pred_petrol = reg_model.predict(X_petrol)
tree_pred_petrol = tree_model.predict(X_petrol)

print('Linear Regression R2 score: '+ str(round(r2_score(y_petrol, reg_pred_petrol),2)))
print('Decision Tree R2 score: '+ str(round(r2_score(y_petrol, tree_pred_petrol),2)))
print('')
print('Linear Regression MAE score: '+ str(round(mae(y_petrol, reg_pred_petrol),2)))
print('Decision Tree MAE score: '+ str(round(mae(y_petrol, tree_pred_petrol),2)))
print('')
print('Linear Regression RMSE score: '+ str(round(mse(y_petrol, reg_pred_petrol, squared=False),2)))
print('Decision Tree RMSE score: '+ str(round(mse(y_petrol, tree_pred_petrol, squared=False),2)))

Linear Regression R2 score: 0.11
Decision Tree R2 score: 0.3

Linear Regression MAE score: 188267.63
Decision Tree MAE score: 134524.7

Linear Regression RMSE score: 343680.95
Decision Tree RMSE score: 303619.18


### Zbiór treningowy i testowy

In [544]:
#podziel zbiór na treningowy i testowy
from sklearn.model_selection import train_test_split

# Podział na dane treningowe i testowe.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=111)

In [545]:
#na podstawie zbioru treningowego wybierz hiperparametry dla modelu drzewa decyzyjnego
model=DecisionTreeRegressor()
gs=GridSearchCV(model,
                param_grid={'max_depth':range(1,20),
                            'min_samples_split':range(50,100,10)},
                cv=10,
                scoring='neg_mean_absolute_error')
gs.fit(X_train,y_train)

In [546]:
print(gs.best_params_)

{'max_depth': 7, 'min_samples_split': 50}


In [547]:
#przetrenuj model z wybranymi hiperparametrami
tree_model=DecisionTreeRegressor(random_state=111,min_samples_split=50,max_depth=7)
tree_model.fit(X_train,y_train)

In [548]:
#sprawdz wyniki na zbiorze testowym
tree_pred=tree_model.predict(X_test)
mae(y_test,tree_pred)

172579.00489160747

# Zadanie 2: Mieszkania

In [549]:
dane=pd.read_csv('dane_sda_mieszkania.csv')
dane.head(5)

Unnamed: 0.1,Unnamed: 0,price,rent_price,area_m2,room_number,year,floor_number,features
0,0,950000.0,400.0,64.0,2.0,2013.0,floor_2,"['meble', 'balkon', 'garaż/miejsce parkingowe'..."
1,1,439000.0,400.0,37.0,2.0,2022.0,floor_1,"['telewizja kablowa', 'internet', 'telefon', '..."
2,2,464500.0,,50.0,2.0,2023.0,floor_2,"['monitoring / ochrona', 'drzwi / okna antywła..."
3,3,391900.0,,46.5,3.0,2023.0,floor_3,"['telewizja kablowa', 'internet', 'telefon', '..."
4,4,650000.0,,54.0,3.0,2021.0,floor_4,"['monitoring / ochrona', 'meble', 'garaż/miejs..."


In [550]:
dane.dtypes

Unnamed: 0        int64
price           float64
rent_price      float64
area_m2         float64
room_number     float64
year            float64
floor_number     object
features         object
dtype: object

### Filter out null values for price

In [551]:
dane[dane.price.isnull()].shape # te które mają price null

(2670, 8)

In [552]:
dane.shape

(8894, 8)

In [553]:
dane=dane[~dane.price.isnull()] # price without nulls

In [554]:
dane.shape

(6224, 8)

### Features manipulation

In [555]:
dane.floor_number.unique()

array(['floor_2', 'floor_1', 'floor_3', 'floor_4', 'ground_floor',
       'floor_5', nan, 'floor_9', 'floor_10', 'floor_7', 'floor_6',
       'floor_8', 'floor_higher_10'], dtype=object)

In [556]:
dane.floor_number.value_counts()

floor_1            1435
ground_floor       1258
floor_2            1123
floor_3            1045
floor_4             648
floor_5             250
floor_6             133
floor_7              86
floor_higher_10      80
floor_10             52
floor_8              51
floor_9              37
Name: floor_number, dtype: int64

In [557]:
floor_dict={'floor_1':1,'floor_2':2,'floor_3':3,'floor_4':4,'floor_5':5,'floor_6':6,'floor_7':7,
            'floor_8':8,'floor_9':9,'floor_10':10,'ground_floor':0,'floor_higher_10':11}

In [558]:
dane['floor_number']=dane['floor_number'].map(floor_dict)

In [559]:
dane['room_number'].value_counts()

2.0    2478
3.0    2384
4.0     813
1.0     400
5.0     120
6.0      25
7.0       3
8.0       1
Name: room_number, dtype: int64

### Features - manual one-hot encoding

In [560]:
dane['features']

0       ['meble', 'balkon', 'garaż/miejsce parkingowe'...
1       ['telewizja kablowa', 'internet', 'telefon', '...
2       ['monitoring / ochrona', 'drzwi / okna antywła...
3       ['telewizja kablowa', 'internet', 'telefon', '...
4       ['monitoring / ochrona', 'meble', 'garaż/miejs...
                              ...                        
8889    ['domofon / wideofon', 'balkon', 'piwnica', 'd...
8890    ['telewizja kablowa', 'internet', 'telefon', '...
8891    ['telewizja kablowa', 'internet', 'drzwi / okn...
8892                                  ['balkon', 'winda']
8893      ['garaż/miejsce parkingowe', 'balkon', 'winda']
Name: features, Length: 6224, dtype: object

In [561]:
dane=dane.reset_index()

In [562]:
features_list=[]

for i in range(len(dane['features'])):
  for j in range(len(dane['features'][i].replace('[','').replace(']','').split(','))):
    features_list.append(dane['features'][i].replace('[','').replace(']','').split(',')[j])

In [563]:
# features_list

In [564]:
list(set(list(set(features_list))[i].replace("'",'').replace(' ','') for i in range(len(set(features_list)))))

['',
 'telefon',
 'systemalarmowy',
 'domofon/wideofon',
 'balkon',
 'garaż/miejsceparkingowe',
 'lodówka',
 'pom.użytkowe',
 'drzwi/oknaantywłamaniowe',
 'piwnica',
 'zmywarka',
 'meble',
 'terenzamknięty',
 'telewizor',
 'piekarnik',
 'kuchenka',
 'pralka',
 'monitoring/ochrona',
 'klimatyzacja',
 'roletyantywłamaniowe',
 'internet',
 'ogródek',
 'taras',
 'dwupoziomowe',
 'oddzielnakuchnia',
 'telewizjakablowa',
 'winda']

In [565]:
features_new=[]
for i in range(len(dane['features'])):
  features_new.append(dane['features'][i].replace('[','').replace(']','').replace("'",'').replace(' ','').split(','))

In [566]:
dane['features_new']=features_new

In [567]:
dane['features_new'][:5]

0    [meble, balkon, garaż/miejsceparkingowe, winda...
1    [telewizjakablowa, internet, telefon, domofon/...
2    [monitoring/ochrona, drzwi/oknaantywłamaniowe,...
3    [telewizjakablowa, internet, telefon, drzwi/ok...
4    [monitoring/ochrona, meble, garaż/miejsceparki...
Name: features_new, dtype: object

In [568]:
def column_based_on_feature(x,feature):
  if feature in x:
    return 1
  else:
    return 0

In [569]:
for f in list(set(list(set(features_list))[i].replace("'",'').replace(' ','') for i in range(len(set(features_list))))):
    dane[f]=dane['features_new'].apply(column_based_on_feature,feature=f)

In [570]:
dane.columns

Index(['index', 'Unnamed: 0', 'price', 'rent_price', 'area_m2', 'room_number',
       'year', 'floor_number', 'features', 'features_new', '', 'telefon',
       'systemalarmowy', 'domofon/wideofon', 'balkon',
       'garaż/miejsceparkingowe', 'lodówka', 'pom.użytkowe',
       'drzwi/oknaantywłamaniowe', 'piwnica', 'zmywarka', 'meble',
       'terenzamknięty', 'telewizor', 'piekarnik', 'kuchenka', 'pralka',
       'monitoring/ochrona', 'klimatyzacja', 'roletyantywłamaniowe',
       'internet', 'ogródek', 'taras', 'dwupoziomowe', 'oddzielnakuchnia',
       'telewizjakablowa', 'winda'],
      dtype='object')

In [571]:
dane=dane[[ 'price', 'rent_price', 'area_m2', 'room_number',
       'year', 'floor_number',
       'oddzielnakuchnia', 'telewizjakablowa', 'meble', 'dwupoziomowe',
       'terenzamknięty', 'telewizor', 'lodówka', 'systemalarmowy', 'telefon',
       'roletyantywłamaniowe', 'internet', 'zmywarka', 'balkon', 'kuchenka',
       'piekarnik', 'piwnica', 'winda', 'pralka', 'garaż/miejsceparkingowe',
       'klimatyzacja', 'taras', 'monitoring/ochrona', 'ogródek',
       'domofon/wideofon', 'drzwi/oknaantywłamaniowe', 'pom.użytkowe']]

### Rent price - full fill with '-99', Year - remove null

In [572]:
# Elementy mające wartość null
dane['rent_price'][dane.rent_price.isnull()]

2      NaN
3      NaN
4      NaN
6      NaN
7      NaN
        ..
6214   NaN
6216   NaN
6219   NaN
6222   NaN
6223   NaN
Name: rent_price, Length: 3496, dtype: float64

In [573]:
 dane['rent_price'][dane.rent_price.isnull()]=-99

In [574]:
dane['rent_price']

0       400.0
1       400.0
2       -99.0
3       -99.0
4       -99.0
        ...  
6219    -99.0
6220    400.0
6221    600.0
6222    -99.0
6223    -99.0
Name: rent_price, Length: 6224, dtype: float64

In [575]:
dane['floor_number'][dane['floor_number'].isnull()]=-1

In [576]:
dane=dane[~dane['year'].isnull()]

In [577]:
dane.head()

Unnamed: 0,price,rent_price,area_m2,room_number,year,floor_number,oddzielnakuchnia,telewizjakablowa,meble,dwupoziomowe,...,winda,pralka,garaż/miejsceparkingowe,klimatyzacja,taras,monitoring/ochrona,ogródek,domofon/wideofon,drzwi/oknaantywłamaniowe,pom.użytkowe
0,950000.0,400.0,64.0,2.0,2013.0,2.0,0,0,1,0,...,1,0,1,0,0,0,0,0,0,1
1,439000.0,400.0,37.0,2.0,2022.0,1.0,0,1,1,0,...,0,0,1,0,0,0,0,1,0,0
2,464500.0,-99.0,50.0,2.0,2023.0,2.0,0,0,0,0,...,1,0,0,0,0,1,0,1,1,0
3,391900.0,-99.0,46.5,3.0,2023.0,3.0,0,1,0,0,...,0,0,1,0,0,1,0,1,1,1
4,650000.0,-99.0,54.0,3.0,2021.0,4.0,0,0,1,0,...,1,0,1,0,0,1,0,0,0,0


In [578]:
X=dane.drop('price', 1)# axis=1
y=dane['price']

  X=dane.drop('price', 1)# axis=1


In [579]:
model=LinearRegression()
model.fit(X,y)

In [580]:
model.intercept_

383799.6282289954

In [581]:
model.coef_

array([ 1.66088171e+02,  2.13449932e-02,  2.28599305e+05, -1.70849081e+02,
        2.14908355e+04, -9.52208285e+03, -1.86780103e+05,  3.37082409e+04,
       -8.18188369e+04,  8.18999654e+04,  2.00128677e+05, -1.74953671e+04,
        1.37364659e+05, -8.80453300e+03,  7.60663314e+04,  9.84320204e+04,
        9.71859409e+04, -1.04996427e+05, -4.53036399e+04, -4.89313529e+04,
       -7.81814428e+04,  5.60280441e+04,  1.79799973e+04,  2.88017817e+04,
        3.07622150e+05,  1.39947257e+05,  1.63098026e+05, -7.21126930e+04,
       -2.97295936e+04, -2.78373436e+04,  3.08447443e+04])

In [582]:
reg_predict=model.predict(X)

In [583]:
tree_model=DecisionTreeRegressor(max_depth=5,min_samples_leaf=300)
tree_model.fit(X,y)

In [584]:
tree_predict=tree_model.predict(X)

In [585]:
#R2
print('Linear Regression R2 score: '+str(round(r2_score(y,reg_predict),2)))
print('Decision Tree R2 score: '+str(round(r2_score(y,tree_predict),2)))

Linear Regression R2 score: 0.34
Decision Tree R2 score: 0.43


In [586]:
#MAE
print('Linear Regression MAE score: '+str(round(mae(y,reg_predict),2)))
print('Decision Tree MAE score: '+str(round(mae(y,tree_predict),2)))

Linear Regression MAE score: 239698.07
Decision Tree MAE score: 209356.19


In [587]:
#RMSE
print('Linear Regression RMSE score: '+str(round(mse(y,reg_predict,squared=False),2)))
print('Decision Tree RMSE score: '+str(round(mse(y,tree_predict,squared=False),2)))

Linear Regression RMSE score: 397632.47
Decision Tree RMSE score: 370369.17


Rozkład zmiennej objaśnianej

Dla danych ciągłych:
- wykresy scatterplot zmienna y vs zmienna ciagla x
- heatmapa zmienne ciagle x vs zmienna y
- rozklady zmiennych ciaglych x (sprawdzenie obserwacji odstających)

Dla danych binarnych:
- boxploty zmienna binarna vs zmienna y