In [1]:
import pandas as pd
import numpy as np
import seaborn as sbn
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

In [2]:
# Read tenis.csv (path relative to the working directory) into a DataFrame
# and preview the first five rows.
csv_path = "tenis.csv"
data = pd.read_csv(csv_path)
data.head(5)

Unnamed: 0,outlook,temperature,humidity,windy,play
0,sunny,85,85,False,no
1,sunny,80,90,True,no
2,overcast,83,86,False,yes
3,rainy,70,96,False,yes
4,rainy,68,80,False,yes


In [3]:
# Quick data-quality report: missing values per column, number of fully
# duplicated rows, and summary statistics of the numeric columns.
separator = "\n------------------------------------"
missing_per_column = data.isna().sum()
duplicate_row_count = data.duplicated().sum()
numeric_summary = data.describe()

print(missing_per_column)
print(separator)
print(duplicate_row_count)
print(separator)
print(numeric_summary)

outlook        0
temperature    0
humidity       0
windy          0
play           0
dtype: int64

------------------------------------
0

------------------------------------
       temperature   humidity
count    14.000000  14.000000
mean     73.571429  81.642857
std       6.571667  10.285218
min      64.000000  65.000000
25%      69.250000  71.250000
50%      72.000000  82.500000
75%      78.750000  90.000000
max      85.000000  96.000000


### Encoding
- Encoding islemi, kategorik verilerin modelin anlayacagi sekilde sayisal gosterime cevrilmesidir.
- Label Encoding -> her bir kategoriyi sayisal bir etiketle temsil eder.
- OneHotEncoding -> her kategoriyi ikili (binary) bir vektorle temsil eder. Her bir kategori icin bir sutun olusturur.

In [4]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [6]:
# Label-encode the two binary columns as integers 0/1
# (windy: False/True, play: no/yes).
#
# Each column gets its own encoder. Refitting one shared encoder would
# overwrite the classes_ learned for "windy" as soon as "play" is fitted,
# making it impossible to inverse_transform "windy" later.
windy_encoder = LabelEncoder()
play_encoder = LabelEncoder()
data["windy"] = windy_encoder.fit_transform(data["windy"])
data["play"] = play_encoder.fit_transform(data["play"])

# Keep the original name available; as before, it refers to the encoder
# that was fitted on "play".
le = play_encoder
data.head()

Unnamed: 0,outlook,temperature,humidity,windy,play
0,sunny,85,85,0,0
1,sunny,80,90,1,0
2,overcast,83,86,0,1
3,rainy,70,96,0,1
4,rainy,68,80,0,1


In [7]:
# One-hot encode the "outlook" column. The encoder receives a one-column
# DataFrame (double brackets) and its result is converted to a dense array.
ohe = OneHotEncoder()
outlook_frame = data[["outlook"]]
outlook_encoded = ohe.fit_transform(outlook_frame)
data_ohe = outlook_encoded.toarray()

In [8]:
# Wrap the dense one-hot matrix in a DataFrame with readable column names
# (outlook_<category>).
#
# The rows of the matrix are in the same order as the rows of `data`, so
# data's index is reused here. Without it the new frame would get a default
# 0..n-1 index, and a column-wise concat (which aligns on the index) would
# misalign rows whenever `data` does not carry that default index.
data_ohe = pd.DataFrame(
    data_ohe,
    columns=ohe.get_feature_names_out(["outlook"]),
    index=data.index,
)
data_ohe.head()

Unnamed: 0,outlook_overcast,outlook_rainy,outlook_sunny
0,0.0,0.0,1.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,0.0,1.0,0.0
4,0.0,1.0,0.0


In [9]:
# Append the one-hot outlook columns to the right of the existing columns
# (column-wise concatenation; rows are matched on the index).
data = pd.concat([data, data_ohe], axis="columns")

In [10]:
# Remove the raw text column "outlook".
# Reassigning the result instead of using inplace=True follows the recommended
# pandas style: it keeps the step chainable and makes the rebinding of `data`
# explicit.
data = data.drop(columns="outlook")

In [11]:
# Display the frame as the cell's output to inspect the result of the
# preceding encoding steps.
data

Unnamed: 0,temperature,humidity,windy,play,outlook_overcast,outlook_rainy,outlook_sunny
0,85,85,0,0,0.0,0.0,1.0
1,80,90,1,0,0.0,0.0,1.0
2,83,86,0,1,1.0,0.0,0.0
3,70,96,0,1,0.0,1.0,0.0
4,68,80,0,1,0.0,1.0,0.0
5,65,70,1,0,0.0,1.0,0.0
6,64,65,1,1,1.0,0.0,0.0
7,72,95,0,0,0.0,0.0,1.0
8,69,70,0,1,0.0,0.0,1.0
9,75,80,0,1,0.0,1.0,0.0


In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Target: humidity, kept as a one-column DataFrame (double brackets).
# Features: every column except temperature and humidity.
#
# Columns are selected by name instead of by position so the selection does
# not silently shift if the column order of `data` ever changes (for example
# when an extra column appears in the CSV).
# NOTE(review): temperature is left out of the features, exactly as the
# previous positional slice did - confirm that this is intended.
y_ = data[["humidity"]]
x_ = data.drop(columns=["temperature", "humidity"])
print("y_ :\n", y_)
print("x_ :\n", x_)

y_ :
     humidity
0         85
1         90
2         86
3         96
4         80
5         70
6         65
7         95
8         70
9         80
10        70
11        90
12        75
13        91
x_ :
     windy  play  outlook_overcast  outlook_rainy  outlook_sunny
0       0     0               0.0            0.0            1.0
1       1     0               0.0            0.0            1.0
2       0     1               1.0            0.0            0.0
3       0     1               0.0            1.0            0.0
4       0     1               0.0            1.0            0.0
5       1     0               0.0            1.0            0.0
6       1     1               1.0            0.0            0.0
7       0     0               0.0            0.0            1.0
8       0     1               0.0            0.0            1.0
9       0     1               0.0            1.0            0.0
10      1     1               0.0            0.0            1.0
11      1     1          

In [14]:
# Hold out 30% of the rows for testing (fixed random_state for a reproducible
# split), then fit a linear regression on the training part.
split_parts = train_test_split(x_, y_, test_size=0.3, random_state=42)
x_train, x_test, y_train, y_test = split_parts

linear_model = LinearRegression()
reg = linear_model.fit(x_train, y_train)  # fit returns the estimator itself

In [15]:
# Predict the target for the held-out test rows with the fitted model.
y_pred = reg.predict(X=x_test)

In [16]:
# Print the predictions and the true test targets one after the other for a
# quick visual comparison.
prediction_label = "Tahmin sonucu : \n"
actual_label = "Gerçek sonuç : \n "

print(prediction_label, y_pred)
print("\n")
print(actual_label, y_test)

Tahmin sonucu : 
 [[86.28947368]
 [70.02631579]
 [96.26315789]
 [80.97368421]
 [94.42105263]]


Gerçek sonuç : 
      humidity
9         80
11        90
0         85
12        75
5         70


### Dummy Variable Trap

- Bu problem genellikle OneHotEncoding isleminde ortaya cikar.
- OneHotEncoding ile bir sutunda bulunan kategorik veriler, ornegin uc kategori icin:
- s1 s2 s3   -> sutunlar
- 1 0 0 
- 0 1 0 
- 0 0 1
- seklinde, kategori sayisi kadar sutuna ayrilir. Model egitilirken bu sutunlarin tamamini ozellik olarak vermek "Dummy Variable Trap", yani kukla degisken tuzagi problemini olusturur. 
- One-Hot Encoding her kategori icin bir sutun olusturdugunda, bu sutunlardan herhangi biri digerlerinden hesaplanabilir (ornegin s3 = 1 - s1 - s2). Bu durumda ozellikler arasinda tam coklu dogrusal baglilik (multicollinearity) olusur; katsayilar tek bir sekilde belirlenemez ve bu durum modelin performansini ve yorumlanabilirligini etkileyebilir.

- Cozum icin : 
En yaygin yaklasim, bir kategoriyi referans olarak secmek ve sadece diger kategorileri dummy degisken olarak kullanmaktir.
Ornegin yukarida overcast'i referans olarak secip yalnizca rainy ve sunny sutunlarini modele verebiliriz (ornegin `OneHotEncoder(drop="first")` ile). Bu sekilde coklu dogrusal baglilik problemi ortadan kalkar.