In [3]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
df = pd.read_csv("../data/raw/istihdam-verileri.csv")

# Delete missing values ​​in the target variable.
df = df.dropna(subset=['istihdam_orani'])

print("After the missing data has been cleaned:")
print(df.isna().sum())

After the missing data has been cleaned:
yil                  0
istanbul_turkiye     0
kirilim_bir          0
kirilim_iki          0
istihdam_bin_kisi    0
istihdam_orani       0
dtype: int64


In [5]:
# It can be saved for graphical and analytical purposes, but will not be used in the model.
df = df.drop(columns=['istanbul_turkiye'])

In [7]:
le_kirilim_bir = LabelEncoder()
le_kirilim_iki = LabelEncoder()

df['kirilim_bir'] = le_kirilim_bir.fit_transform(df['kirilim_bir'])
df['kirilim_iki'] = le_kirilim_iki.fit_transform(df['kirilim_iki'])

In [9]:
print("kirilim_bir mapping:")
for i, label in enumerate(le_kirilim_bir.classes_):
    print(f"{i} --> {label}")

print("\nkirilim_iki mapping:")
for i, label in enumerate(le_kirilim_iki.classes_):
    print(f"{i} --> {label}")

print(df[['kirilim_bir', 'kirilim_iki']].dtypes)
print(df[['kirilim_bir', 'kirilim_iki']].head(10))

kirilim_bir mapping:
0 --> 0
1 --> 1
2 --> 2

kirilim_iki mapping:
0 --> 0
1 --> 1
2 --> 2
3 --> 3
4 --> 4
5 --> 5
6 --> 6
7 --> 7
8 --> 8
9 --> 9
10 --> 10
kirilim_bir    int64
kirilim_iki    int64
dtype: object
   kirilim_bir  kirilim_iki
0            0            5
1            0            6
2            2            0
3            2            1
4            2            2
5            2            3
6            2            4
7            1            9
8            1            7
9            1            8


In [10]:
X = df.drop(columns=['istihdam_orani'])
y = df['istihdam_orani']

print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (220, 4)
y shape: (220,)


In [11]:
scaler = StandardScaler()

numeric_cols = ['yil', 'istihdam_bin_kisi']

X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

In [13]:
print("\nDataset information after preprocessing:")
print(X.info())


Dataset information after preprocessing:
<class 'pandas.core.frame.DataFrame'>
Index: 220 entries, 0 to 529
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   yil                220 non-null    float64
 1   kirilim_bir        220 non-null    int64  
 2   kirilim_iki        220 non-null    int64  
 3   istihdam_bin_kisi  220 non-null    float64
dtypes: float64(2), int64(2)
memory usage: 8.6 KB
None


In [14]:
# Combine x and y again.
df_processed = X.copy()
df_processed['istihdam_orani'] = y

# save as CSV
df_processed.to_csv(
    "../data/processed/istihdam_preprocessed.csv",
    index=False,
    encoding="utf-8"
)

print("Preprocessed data saved successfully.")

Preprocessed data saved successfully.
