# verilerin lineer regredyon ile tahmin edilmesi

In [42]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


In [43]:
df = pd.read_csv('data_cleaned.csv')

In [44]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   city         25 non-null     object
 1   district     25 non-null     object
 2   neigborhood  25 non-null     object
 3   room         25 non-null     int64 
 4   living_room  25 non-null     int64 
 5   area         25 non-null     int64 
 6   age          25 non-null     int64 
 7   floor        25 non-null     int64 
 8   price        25 non-null     int64 
dtypes: int64(6), object(3)
memory usage: 1.9+ KB
None


In [45]:
df['city']=df['city'].astype('category')
df['district']=df['district'].astype('category')
df['neigborhood']=df['neigborhood'].astype('category')
df['room']=df['room'].astype('int')
df['living_room']=df['living_room'].astype('int')
df['area']=df['area'].astype('int')
df['floor']=df['floor'].astype('int')
df['age']=df['age'].astype('int')
df['price']=df['price'].astype('int')

In [46]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   city         25 non-null     category
 1   district     25 non-null     category
 2   neigborhood  25 non-null     category
 3   room         25 non-null     int64   
 4   living_room  25 non-null     int64   
 5   area         25 non-null     int64   
 6   age          25 non-null     int64   
 7   floor        25 non-null     int64   
 8   price        25 non-null     int64   
dtypes: category(3), int64(6)
memory usage: 3.1 KB
None


In [47]:
categorical_features = ['city', 'district','neigborhood']
numerical_features =['room','living_room','area','age','floor']
#bunları kullanıp tahmi dedeceğim için içinde price olmamalı!


In [48]:
full_pipeline = ColumnTransformer([
    ('num',StandardScaler(),numerical_features),
    ('cat',OneHotEncoder(handle_unknown='ignore'),categorical_features)
])
#handle_unknown da eğitme kısmında olmayan veri eger doğrulama da cıkarsa hata olmasın diye onu ignorlamak için var
#OneHotEncoder ne demek,sen nerede isen onu 1 0 binary e cevirir
#izmir mugla istanbul
#  0     0      1

In [49]:
#X bir matris y bri vektör
X = df.drop('price',axis=1)#sütun ekseni için 1
y = df['price']

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)


In [51]:
#model olusturuldu
model = Pipeline([
    ('preparation',full_pipeline),
    ('model',LinearRegression())
])

In [52]:
model.fit(X_train,y_train)
#boru hattını olusturmuş olduk

In [53]:
y_pred = model.predict(X_test) #tahmin edilmiş y degerleri
mse = mean_squared_error(y_test,y_pred) #gercek ile tahmin arası hata oranı hesaplama
rmse = np.sqrt(mse)
r2 = r2_score(y_test,y_pred)#varyasyon hesaplandı



In [54]:
print(f"MSE:{mse}")
print(f"RMSE:{rmse}")
print(f"R2:{r2}")
#sonuca göre model başarısız 
# model degerleri ezberlmeiş olabilir,çözüm yolları yapılacak


MSE:2764258.8479052386
RMSE:1662.6060411009094
R2:-0.5870091284124039


In [55]:
feature_importance = model.named_steps['model'].coef_
print(feature_importance)

[ -802.36888816     0.           219.82311243  -844.64085239
  -681.11008463  -486.48956499   405.593191      79.07114799
   800.94650666  -634.26848501   224.47266978  -389.32546543
   405.593191   -1266.37179209   801.54648167   435.76598082
   224.47266978   -23.21735717  -611.05112784   -74.92780055
    79.07114799  -486.48956499   904.9336368   -389.32546543
   178.56243506   435.76598082 -1266.37179209  -611.05112784
   904.9336368   -389.32546543   801.54648167   652.91513776
  -486.48956499    79.07114799   -23.21735717   227.03075595
   -74.92780055   415.57154618  -844.01401416]


In [56]:
print("Numerical Features")
for i in range(len(numerical_features)):
    print(numerical_features[i],feature_importance[i])

Numerical Features
room -802.3688881643261
living_room 0.0
area 219.82311243245724
age -844.6408523852799
floor -681.1100846326922


In [57]:
print("Categorical Features")
for i in range(len(categorical_features)):
    # Pipeline'dan transformer'a doğru şekilde erişim
    for j in range(len(model.named_steps['preparation'].transformers_[1][1].categories_[i])):
        print(
            model.named_steps['preparation'].transformers_[1][1].categories_[i][j],  # Doğru kategorilere erişim
            feature_importance[len(numerical_features) + j]  # Feature importance dizisinden doğru indeks
        )


Categorical Features
adana -486.48956499185795
balikesir 405.5931910031058
edirne 79.07114798975591
istanbul 800.9465066566391
izmir -634.2684850058957
mugla 224.47266978087563
tekirdag -389.32546543262504
altieylul -486.48956499185795
arnavutkoy 405.5931910031058
besiktas 79.07114798975591
beykoz 800.9465066566391
bodrum -634.2684850058957
bornova 224.47266978087563
buca -389.32546543262504
kadikoy 405.5931910031058
merkez -1266.3717920882382
saricam 801.5464816742045
sariyer 435.76598082074133
suleymanpasa 224.47266978087563
1_gundogan -486.48956499185795
acarlar 405.5931910031058
anadolu 79.07114798975591
ataturk 800.9465066566391
cayirbasi -634.2684850058957
ciftlikonu 224.47266978087563
dikilitas -389.32546543262504
golkoy 405.5931910031058
gultepe -1266.3717920882382
istasyon 801.5464816742045
mevlana 435.76598082074133
plevne 224.47266978087563
suadiye -23.217357169326302
turkbuku -611.0511278365683
yalikavak -74.92780054619442


In [62]:
new_data = pd.DataFrame({
    'city':['istanbul'],
    'district':['yunusemre'],
    'neigborhood':['guzelyurt'],
    'room':[3],
    'living_room':[1],
    'area':[200],
    'age':[5],
    'floor':[3]
})
print(model.predict(new_data))

[1023.68634014]


In [61]:
print(df[(df['city']=='istanbul')])

        city    district neigborhood  room  living_room  area  age  floor  \
1   istanbul     sariyer   cayirbasi     3            1   140   16      5   
2   istanbul     kadikoy     suadiye     2            1   100    5      5   
5   istanbul    besiktas   dikilitas     3            1   140   25      1   
8   istanbul  arnavutkoy     anadolu     2            1    85    1      5   
9   istanbul  arnavutkoy     anadolu     2            1    80    1      0   
10  istanbul      beykoz     acarlar     4            1   200   11      0   

    price  
1     520  
2     670  
5     320  
8     180  
9     160  
10   1100  
