In [None]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Read both CSV splits, using the first column of each file as the row index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("/content/train_data.csv", "/content/test_data.csv")
)

# Preview the first rows of the training split.
df_train.head()


In [None]:
# Preview the first rows of the frame loaded from test_data.csv.
df_test.head()

In [None]:
# Per-column count of missing values: training frame first, then test frame.
print(df_train.isna().sum(), df_test.isna().sum(), sep="\n")

In [None]:
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called directly; wrapping it in print() would append a stray
# "None" line after each report.
df_train.info()
df_test.info()

In [None]:
# Summary statistics of the numeric columns: training frame, then test frame.
print(df_train.describe(), df_test.describe(), sep="\n")

In [None]:
# Number of distinct values in each column: training frame, then test frame.
print(
    *(frame.nunique() for frame in (df_train, df_test)),
    sep="\n",
)

In [None]:
# Print the frequency table of every training column, one column at a time.
for column_name, column_values in df_train.items():
    print(column_values.value_counts())

In [None]:
def text_to_num(data):
    """Return a copy of ``data`` with ``flight`` removed and ``stops``/``class`` integer-coded.

    The input frame is left untouched, so the cell that calls this function
    can be re-run safely: a missing ``flight`` column is ignored and values
    that are already numeric pass through unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the ``stops`` and ``class`` columns and,
        optionally, a ``flight`` column.

    Returns
    -------
    pandas.DataFrame
        New frame without ``flight`` in which ``stops`` is coded as
        zero -> 0, one -> 1, two_or_more -> 2 and ``class`` as
        Economy -> 0, Business -> 1. Values outside these mappings are
        kept as they are.

    Raises
    ------
    KeyError
        If ``stops`` or ``class`` is not a column of ``data``.
    """
    stops_codes = {'zero': 0, 'one': 1, 'two_or_more': 2}
    class_codes = {'Economy': 0, 'Business': 1}

    # drop() without inplace returns a new frame; errors='ignore' makes a
    # second pass over already-processed data a no-op instead of a KeyError.
    encoded = data.drop(columns='flight', errors='ignore')

    # An explicit lookup with a pass-through default avoids relying on
    # replace()'s implicit object -> int downcasting.
    encoded['stops'] = encoded['stops'].map(lambda value: stops_codes.get(value, value))
    encoded['class'] = encoded['class'].map(lambda value: class_codes.get(value, value))

    return encoded



In [None]:
# Apply the same text-to-number encoding to the training and test frames.
df_train, df_test = map(text_to_num, (df_train, df_test))

In [None]:
# Pairwise correlation of the numeric columns only. The frame still holds
# string columns (airline, cities, departure/arrival times), and recent
# pandas versions raise on those instead of silently skipping them.
df_train.select_dtypes(include='number').corr()

In [None]:
# Separate the target column from the feature columns.
y = df_train['price']
X = df_train.drop(columns='price')

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=55,
)


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import metrics

# Columns that are standardised: 'class' and 'stops' are integer-coded by
# text_to_num, 'duration' and 'days_left' are taken as-is.
num_attr = ['class', 'stops', 'duration', 'days_left']
# Columns that are one-hot encoded.
cat_attr = ['airline', 'source_city','departure_time','arrival_time','destination_city']

# Numeric branch: scale each column to zero mean and unit variance.
num_pipline = Pipeline([
    ('std', StandardScaler())
])

# handle_unknown='ignore' encodes a category that was not present when the
# encoder was fitted as an all-zero row instead of raising at transform
# time, so unseen values in later data cannot crash the pipeline.
full_pipline = ColumnTransformer([
    ('num', num_pipline, num_attr),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attr)
])



In [None]:
# Learn the scaling statistics and the one-hot categories from the
# training rows only.
X_train = full_pipline.fit_transform(X_train)

# Re-use the already fitted transformer for every other data set. Calling
# fit_transform here would re-estimate means/variances and category sets
# on the evaluation data, which leaks information and can yield a feature
# matrix whose columns do not line up with the one the models were trained on.
X_test = full_pipline.transform(X_test)
df_test = full_pipline.transform(df_test)

# NOTE: this cell overwrites its own inputs with transformed matrices, so
# it can only be run once per kernel session; re-run the split and loading
# cells first if it has to be executed again.

In [None]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error



In [None]:
# Baseline model: fit() returns the estimator itself, so construction and
# training are chained into one statement.
md = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out rows.
y_predict = md.predict(X_test)



In [None]:
# Mean absolute error of the current predictions on the held-out rows.
print(f"MAE: {mean_absolute_error(y_test, y_predict)}")
# The square root of the mean squared error is the RMSE, so label it as such
# (it was previously printed under a second "MAE" label).
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_predict))}")

In [None]:
# Random forests are stochastic (bootstrap samples, feature sub-sampling);
# seeding the estimator with the same seed as the train/test split makes
# the reported scores repeatable across runs.
md = RandomForestRegressor(random_state=55)
md.fit(X_train, y_train)

# Predictions for the held-out rows.
y_predict = md.predict(X_test)

In [None]:
# Mean absolute error of the current predictions on the held-out rows.
print(f"MAE: {mean_absolute_error(y_test, y_predict)}")
# The square root of the mean squared error is the RMSE, so label it as such
# (it was previously printed under a second "MAE" label).
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_predict))}")

In [None]:
# The target is a continuous price, so a regression tree is required.
# A classifier would treat every distinct price as its own class label,
# which is both semantically wrong and extremely expensive to fit.
# The seed makes tie-breaking between equally good splits repeatable.
md = DecisionTreeRegressor(random_state=55)
md.fit(X_train, y_train)

# Predictions for the held-out rows.
y_predict = md.predict(X_test)
       

In [None]:
# Mean absolute error of the current predictions on the held-out rows.
print(f"MAE: {mean_absolute_error(y_test, y_predict)}")
# The square root of the mean squared error is the RMSE, so label it as such
# (it was previously printed under a second "MAE" label).
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_predict))}")

In [None]:
# Support-vector regressor with default hyper-parameters; fit() hands back
# the estimator, so it is created and trained in a single statement.
md = SVR().fit(X_train, y_train)

# Predictions for the held-out rows.
y_predict = md.predict(X_test)

In [None]:
# Mean absolute error of the current predictions on the held-out rows.
print(f"MAE: {mean_absolute_error(y_test, y_predict)}")
# The square root of the mean squared error is the RMSE, so label it as such
# (it was previously printed under a second "MAE" label).
print(f"RMSE: {np.sqrt(mean_squared_error(y_test, y_predict))}")

In [None]:
# Final model used for the submission. Seeding the forest makes the
# generated predictions (and therefore the submission file) reproducible.
model = RandomForestRegressor(random_state=55)
model.fit(X_train, y_train)

# df_test holds the transformed feature matrix of the competition test set
# at this point, not the raw frame that was loaded from disk.
y_predict = model.predict(df_test)
y_predict

In [None]:
# Load the submission template; its first column is used as the row index.
sample = pd.read_csv("/content/sample_solution.csv", index_col=0)

# y_predict is re-used by several cells above, so make sure the predictions
# currently bound to it really belong to the submission rows.
if len(sample) != len(y_predict):
    raise ValueError(
        f"Expected {len(sample)} predictions for the submission, got {len(y_predict)}"
    )

# Explicit column assignment: attribute-style `sample.price = ...` would
# silently create a plain Python attribute (and write no predictions) if the
# template had no 'price' column. assign() also leaves the template intact.
submission = sample.assign(price=y_predict)
submission.to_csv('submission.csv')
submission.head()