In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, Normalizer
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import cohen_kappa_score, confusion_matrix,ConfusionMatrixDisplay
from xgboost import XGBRegressor

# !pip install pgeocode

# import pgeocode

In [None]:
# Load the three CSV inputs. `cityCode` is read as object (not inferred as a
# number) in every file.
city_code_as_object = {'cityCode': object}
train = pd.read_csv('/kaggle/input/playground-series-s3e6/train.csv', dtype=city_code_as_object)
test = pd.read_csv('/kaggle/input/playground-series-s3e6/test.csv', dtype=city_code_as_object)
original = pd.read_csv(
    '/kaggle/input/paris-housing-price-prediction/ParisHousing.csv',
    dtype=city_code_as_object,
)

# Give every row of `original` an `id` equal to its row position plus
# 1,000,000.
original = original.reset_index(drop=True).assign(id=lambda frame: frame.index + 1000000)
train.head()

In [None]:
# Preview the first five rows of `original` (including the generated `id`).
original.head(n=5)

In [None]:
# Left-pad every `cityCode` string with zeros to a width of five characters.
# Each frame has to be padded from its OWN column: assigning another frame's
# Series would be index-aligned and silently replace this frame's codes with
# the other frame's values.
for frame in (train, test, original):
    frame['cityCode'] = frame['cityCode'].str.zfill(5)

In [None]:
# Row counts of train, test and original, one per line.
print(len(train), len(test), len(original), sep='\n')

In [None]:
# Feature list: every numeric column of `train` except the row identifier and
# the target. `list.remove` raises ValueError if either name is missing.
num_cols = list(train.select_dtypes(include=np.number).columns)
for excluded in ('id', 'price'):
    num_cols.remove(excluded)

In [None]:
# One panel per numeric feature, three panels per row, overlaying the kernel
# density estimate of that feature in train (red), test (blue) and
# original (green).
ncols = 3
nrows = -(-len(num_cols) // ncols)  # ceiling division
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(15, 15))

kde_sources = (
    (train, 'r', 'train'),
    (test, 'b', 'test'),
    (original, 'g', 'original'),
)
for ax, f in zip(axes.flat, num_cols):
    for frame, colour, tag in kde_sources:
        sns.kdeplot(frame[f], color=colour, label=tag, ax=ax)
    ax.set_title(f)
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Box plot of the target column in the training data.
sns.boxplot(data=train, y='price')

In [None]:
# eiffel_zip = '75007'
# dist = pgeocode.GeoDistance('fr')
# train['Dist_From_Eiffel'] = train['cityCode'].apply(lambda x: dist.query_postal_code(eiffel_zip , x))

# nomi = pgeocode.Nominatim('fr')
# cols = ['place_name','state_name','state_code','latitude','longitude']

# train[cols] = train.apply(lambda row: nomi.query_postal_code(row['cityCode'])[cols], axis=1, result_type='expand')
# train.head()

# original[cols] = original.apply(lambda row: nomi.query_postal_code(row['cityCode'])[cols], axis=1, result_type='expand')
# original.head()

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric features
# and the target in the training data.
corr_cols = [*num_cols, 'price']
corr_matrix = train.loc[:, corr_cols].corr()

plt.figure(figsize=(15, 15))
sns.heatmap(corr_matrix, annot=True)
plt.show()

In [None]:
# Number of training rows per `made` value, keeping only values below 5000.
made = train.groupby('made').size().reset_index(name='Count')
made = made.loc[made['made'] < 5000]

# Mean price per `made` value; the column keeps the name `price`.
made1 = train.groupby('made')['price'].mean().reset_index()

# One row per `made` value with both `Count` and mean `price`.
made = made.merge(made1, on='made', how='inner')

# Count on the left axis, mean price (red) on a twin right axis.
ax = made.plot(x='made', y='Count', legend=False)
ax2 = ax.twinx()
made.plot(x='made', y='price', ax=ax2, legend=False, color='r')
ax.figure.legend()
plt.show()

In [None]:
# Correlations between made, Count and mean price for made <= 2000.
made.loc[made['made'].le(2000)].corr()

In [None]:
# Correlations between made, Count and mean price for made > 2007.
made.loc[made['made'].gt(2007)].corr()

In [None]:
# Append the rows of `original` to the training data.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# both inputs keep their own row labels, so any label they share appears twice
# and label-based (.loc) access becomes ambiguous.
train = pd.concat([train, original], ignore_index=True)

In [None]:
# Partition train and test into three bands on `made`:
#   band 1: made <= 2000
#   band 2: 2001 <= made <= 2007 (both ends inclusive)
#   band 3: made > 2007
train_1 = train.loc[train['made'].le(2000)]
test_1 = test.loc[test['made'].le(2000)]
train_2 = train.loc[train['made'].between(2001, 2007)]
test_2 = test.loc[test['made'].between(2001, 2007)]
train_3 = train.loc[train['made'].gt(2007)]
test_3 = test.loc[test['made'].gt(2007)]

In [None]:
# Attach the per-`made` training row count (`Count`) to band 1 as a feature.
# Any `Count` left over from a previous run of this cell is dropped first, so
# re-running does not produce Count_x / Count_y columns.
count_by_made = made[['made', 'Count']]

# Training rows keep the inner join: rows whose `made` value is absent from
# the lookup are left out of the fit.
train_1 = pd.merge(train_1.drop(columns=['Count'], errors='ignore'),
                   count_by_made, on='made', how='inner')

# Test rows use a left join instead. An inner join would silently drop every
# test row whose `made` value is missing from the lookup, and that id would
# then be missing from the submission. Unmatched rows get Count = 0.
test_1 = pd.merge(test_1.drop(columns=['Count'], errors='ignore'),
                  count_by_made, on='made', how='left')
test_1['Count'] = test_1['Count'].fillna(0).astype(int)

# Correlate numeric columns only, chosen explicitly so the result does not
# depend on how the installed pandas treats the object-typed `cityCode`.
plt.figure(figsize=(15,15))
sns.heatmap(train_1.select_dtypes(include=np.number).corr(),annot=True)
plt.show()

In [None]:
# Attach the per-`made` training row count (`Count`) to band 2 as a feature.
# Any `Count` left over from a previous run of this cell is dropped first, so
# re-running does not produce Count_x / Count_y columns.
count_by_made = made[['made', 'Count']]

# Training rows keep the inner join: rows whose `made` value is absent from
# the lookup are left out of the fit.
train_2 = pd.merge(train_2.drop(columns=['Count'], errors='ignore'),
                   count_by_made, on='made', how='inner')

# Test rows use a left join instead. An inner join would silently drop every
# test row whose `made` value is missing from the lookup, and that id would
# then be missing from the submission. Unmatched rows get Count = 0.
test_2 = pd.merge(test_2.drop(columns=['Count'], errors='ignore'),
                  count_by_made, on='made', how='left')
test_2['Count'] = test_2['Count'].fillna(0).astype(int)

# Correlate numeric columns only, chosen explicitly so the result does not
# depend on how the installed pandas treats the object-typed `cityCode`.
plt.figure(figsize=(15,15))
sns.heatmap(train_2.select_dtypes(include=np.number).corr(),annot=True)
plt.show()

In [None]:
# Attach the per-`made` training row count (`Count`) to band 3 as a feature.
# Any `Count` left over from a previous run of this cell is dropped first, so
# re-running does not produce Count_x / Count_y columns.
count_by_made = made[['made', 'Count']]

# Training rows keep the inner join: rows whose `made` value is absent from
# the lookup are left out of the fit.
train_3 = pd.merge(train_3.drop(columns=['Count'], errors='ignore'),
                   count_by_made, on='made', how='inner')

# Test rows use a left join instead. An inner join would silently drop every
# test row whose `made` value is missing from the lookup, and that id would
# then be missing from the submission. Unmatched rows get Count = 0.
test_3 = pd.merge(test_3.drop(columns=['Count'], errors='ignore'),
                  count_by_made, on='made', how='left')
test_3['Count'] = test_3['Count'].fillna(0).astype(int)

# Correlate numeric columns only, chosen explicitly so the result does not
# depend on how the installed pandas treats the object-typed `cityCode`.
plt.figure(figsize=(15,15))
sns.heatmap(train_3.select_dtypes(include=np.number).corr(),annot=True)
plt.show()

In [None]:
# Add the merged `Count` column to the model feature list.
# Guarded so that re-running this cell does not append 'Count' a second time,
# which would select the column twice when building the feature matrices.
if 'Count' not in num_cols:
    num_cols = num_cols + ['Count']

In [None]:
# Band 1 model: fit on train_1, then predict `price` for test_1.
X1 = train_1[num_cols]
y1 = train_1['price']

# 'reg:squarederror' is the current name of XGBoost's squared-error regression
# objective; the older alias 'reg:linear' is deprecated.
model = XGBRegressor(max_depth=3, learning_rate=0.24, n_estimators=2000,
                     objective='reg:squarederror', booster='gbtree')
XGB = model.fit(X1, y1)

X_test = test_1[num_cols]
prediction = XGB.predict(X_test)

# Store the predictions next to the test rows; show a preview only.
test_1['price'] = prediction
test_1.head()

In [None]:
# Band 2 model: fit on train_2, then predict `price` for test_2.
X2 = train_2[num_cols]
y2 = train_2['price']

# 'reg:squarederror' is the current name of XGBoost's squared-error regression
# objective; the older alias 'reg:linear' is deprecated.
model = XGBRegressor(max_depth=3, learning_rate=0.24, n_estimators=2000,
                     objective='reg:squarederror', booster='gbtree')
XGB = model.fit(X2, y2)

X_test = test_2[num_cols]
prediction = XGB.predict(X_test)

# Store the predictions next to the test rows; show a preview only.
test_2['price'] = prediction
test_2.head()

In [None]:
# Band 3 model: fit on train_3, then predict `price` for test_3.
X3 = train_3[num_cols]
y3 = train_3['price']

# 'reg:squarederror' is the current name of XGBoost's squared-error regression
# objective; the older alias 'reg:linear' is deprecated.
model = XGBRegressor(max_depth=3, learning_rate=0.24, n_estimators=2000,
                     objective='reg:squarederror', booster='gbtree')
XGB = model.fit(X3, y3)

X_test = test_3[num_cols]
prediction = XGB.predict(X_test)

# Store the predictions next to the test rows; show a preview only.
test_3['price'] = prediction
test_3.head()

In [None]:
# Stack the three per-band prediction frames, keep only the id and the
# predicted price, order by id, and write the result to submission.csv.
submission = (
    pd.concat([test_1, test_2, test_3])
    .loc[:, ['id', 'price']]
    .sort_values(by='id')
)
submission.to_csv('submission.csv', index=False)
submission.head()