In [None]:
import pandas as pd
from unidecode import unidecode
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

In [None]:
# Load the listings from disk, then discard rows that are exact duplicates.
df = pd.read_csv('data.csv')
df = df.drop_duplicates()
df.head()

In [None]:
# Print the frame summary (column names, non-null counts, dtypes) to spot
# missing values and columns that were read as text.
df.info()

In [None]:
# Number of distinct values in the 'token' column.
# NOTE(review): presumably 'token' is the per-listing identifier — confirm.
df['token'].nunique()

In [None]:
# Remove rows whose model year is the textual "before 1366" bucket; that label
# is not a number, and the column is cast to int in a later cell.
df = df[df['مدل (سال تولید)'].ne('قبل از ۱۳۶۶')]

In [None]:
# Mileage column: transliterate each value to ASCII with unidecode, remove
# comma separators, then cast to int.
df['کارکرد'] = (
    df['کارکرد']
    .map(unidecode)
    .str.replace(',', '')
    .astype(int)
)

In [None]:
# Model-year column: transliterate each value to ASCII with unidecode, remove
# comma separators, then cast to int.
df['مدل (سال تولید)'] = (
    df['مدل (سال تولید)']
    .map(unidecode)
    .str.replace(',', '')
    .astype(int)
)

In [None]:
# Third-party-insurance column: strip the unit word, transliterate to ASCII,
# and cast to float. Values go through str() first, so a missing entry becomes
# the text 'nan' and ends up as a float NaN.
df['مهلت بیمهٔ شخص ثالث'] = (
    df['مهلت بیمهٔ شخص ثالث']
    .str.replace('ماه', '')
    .map(lambda value: unidecode(str(value)))
    .astype(float)
)

In [None]:
# Price column: strip the currency word, transliterate to ASCII, remove comma
# separators, keep the first run of digits, and cast to float.
# - The pattern is a raw string: '\d' inside a normal string literal is an
#   invalid escape sequence and triggers a warning on modern Python.
# - expand=False makes str.extract return a Series (one capture group) instead
#   of a one-column DataFrame, so a plain Series is assigned to the column.
# Rows where no digits are found become NaN.
df['قیمت'] = (
    df['قیمت']
    .str.replace('تومان', '')
    .apply(lambda x: unidecode(x))
    .str.replace(',', '')
    .str.extract(r'(\d+)', expand=False)
    .astype(float)
)

In [None]:
# Preview the first rows to eyeball the result of the numeric conversions above.
df.head()

In [None]:
# Build the modelling frame from a copy of df, leaving out the columns listed
# below so they are not used as features.
data = df.copy().drop(
    columns=[
        'token',
        'description',
        'نمایشگاه',
        'فروشنده',
        'وضعیت شاسی عقب',
        'وضعیت شاسی جلو',
        'برند و تیپ',
    ]
)

In [None]:
# Collect, in column order, the names of all columns stored with object dtype;
# these are the ones that get one-hot encoded.
categorical_cols = [
    name for name, dtype in data.dtypes.items() if dtype == 'object'
]
print(categorical_cols)

In [None]:
# One-hot encode the categorical columns. get_dummies removes the encoded
# source columns and appends the indicator columns after the remaining ones;
# drop_first=True omits the first level of each category.
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)
data.head()

In [None]:
# Print each categorical column name with its number of distinct values in df.
for column in categorical_cols:
    print(f"{column} {df[column].nunique()}")

In [None]:
# Drop incomplete rows once so features and target come from the same frame.
model_data = data.dropna()

# Features: every column except the price.
X = model_data.drop(columns='قیمت')

# Target: the price column, selected by name. A hard-coded positional index
# would silently pick the wrong column if the column order ever changes.
y = model_data['قیمت']

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=4,
)

In [None]:
# Standardise the features. The scaler is fitted on the training split only
# and the same fitted statistics are then applied to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Fit an ordinary least-squares model on the scaled training data; fit()
# returns the estimator itself, which is then shown as the cell output.
regressor_lr = LinearRegression().fit(X_train, y_train)
regressor_lr

In [None]:
# Model score on the training split (for LinearRegression, score() is R^2).
regressor_lr.score(X_train, y_train)

In [None]:
# Model score on the held-out test split (for LinearRegression, score() is R^2);
# compare with the training score to gauge over-fitting.
regressor_lr.score(X_test, y_test)