In [13]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import MinMaxScaler

In [14]:
from scipy.stats import skew

def find_skewed_columns(dataframe, threshold=0.5):
    """Return the int64/float64 columns whose absolute skewness exceeds ``threshold``.

    Missing values are dropped before the skewness is computed. Without that,
    ``scipy.stats.skew`` returns NaN for any column containing a NaN, the
    comparison against ``threshold`` is False, and the column is silently
    never reported.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect; columns that are not int64/float64 are ignored.
    threshold : float, default 0.5
        A column is reported when ``abs(skewness) > threshold``.

    Returns
    -------
    list of (column, skewness) tuples, in the frame's column order.
    """
    skewed_columns = []

    for column in dataframe.columns:
        if dataframe[column].dtype in ['int64', 'float64']:
            observed = dataframe[column].dropna()
            if observed.empty:
                # Nothing to measure in an all-missing column.
                continue
            skewness = skew(observed)
            # A constant column yields NaN, which fails the comparison and is skipped.
            if abs(skewness) > threshold:
                skewed_columns.append((column, skewness))

    return skewed_columns

def check_scaling_needed(dataframe, threshold=5):
    """List int64/float64 columns whose value range (max - min) is above ``threshold``.

    Returns a list of ``(column, range)`` tuples in the frame's column order;
    columns of any other dtype are ignored.
    """
    numeric_dtypes = ['int64', 'float64']
    wide_columns = []

    for name in dataframe.columns:
        series = dataframe[name]
        if series.dtype not in numeric_dtypes:
            continue
        spread = series.max() - series.min()
        if spread > threshold:
            wide_columns.append((name, spread))

    return wide_columns

In [15]:
# Load the competition data. `test` is modified by later cells, so an
# untouched copy is kept in `testOriginal` for building the submission.
train = pd.read_csv('../../2nd-Comp-Data/train.csv')
test = pd.read_csv('../../2nd-Comp-Data/test.csv')
# Copy in memory instead of parsing the same CSV from disk a second time.
testOriginal = test.copy()

In [16]:
# Show the (rows, columns) dimensions of the training frame.
train.shape

(181507, 272)

In [17]:
# Remove the identifier and the 'sub_area' column from the test frame, in place.
for unused_column in ('row ID', 'sub_area'):
    test.drop(columns=unused_column, inplace=True)

In [18]:
# Remove 'sub_area' from the training frame as well, in place.
train.drop(columns='sub_area', inplace=True)

<h1>Categorical To Numerical</h1>

In [19]:
# One-hot encode the remaining categorical columns of both frames.
train = pd.get_dummies(train)
test = pd.get_dummies(test)

# The two frames are encoded independently, so a category present in only one
# of them would leave the dummy columns (or their order) out of sync and break
# prediction. Align the test frame to the training feature columns:
# dummies missing from test are filled with 0, extras are discarded.
_train_feature_columns = [name for name in train.columns if name != 'price_doc']
test = test.reindex(columns=_train_feature_columns, fill_value=0)

<h1>Scaling</h1>

In [20]:
# Report every numeric training column whose value range exceeds the default threshold.
scaling_needed_columns = check_scaling_needed(train)

if scaling_needed_columns:
    print("Columns that may require scaling:")
    for name, spread in scaling_needed_columns:
        print(f"{name}: Range = {spread}")
else:
    print("No columns require scaling.")


Columns that may require scaling:
full_sq: Range = 5326.0
life_sq: Range = 7478.0
floor: Range = 76.99862964695038
area_m: Range = 203990181.43899998
raion_popul: Range = 244923.0
children_preschool: Range = 19048.0
preschool_education_centers_raion: Range = 13.0
children_school: Range = 18915.0
school_education_centers_raion: Range = 14.0
healthcare_centers_raion: Range = 6.0
sport_objects_raion: Range = 29.0
additional_education_raion: Range = 16.0
culture_objects_top_25_raion: Range = 10.0
shopping_centers_raion: Range = 23.0
office_raion: Range = 141.0
full_all: Range = 1714184.0
male_f: Range = 773377.0
female_f: Range = 940804.0
young_all: Range = 40327.0
young_male: Range = 20788.0
young_female: Range = 19538.0
work_all: Range = 159657.0
work_male: Range = 78759.0
work_female: Range = 80897.0
ekder_all: Range = 56538.0
ekder_male: Range = 19119.0
ekder_female: Range = 37418.0
0_6_all: Range = 19048.0
0_6_male: Range = 9896.0
0_6_female: Range = 9151.0
7_14_all: Range = 18915.0
7

In [21]:
# Min-max scale the wide-range feature columns.
#
# The scaler is fitted on the training features only and the *same* fitted
# transform is then applied to the test frame, so that the model sees train
# and test features on one scale. The target 'price_doc' is deliberately
# left unscaled: predictions are written out directly, so they must stay in
# the original price units.
scaling_needed_columns = check_scaling_needed(train)
scaler = MinMaxScaler()

_columns_to_scale = [
    column for column, _ in scaling_needed_columns if column != 'price_doc'
]

if _columns_to_scale:
    train[_columns_to_scale] = scaler.fit_transform(train[_columns_to_scale])
    # transform only (no refit) so test values are mapped with the training min/max.
    test[_columns_to_scale] = scaler.transform(test[_columns_to_scale])

<h1>Normalization</h1>

In [22]:
# skewed_columns = find_skewed_columns(train)

# if not skewed_columns:
#     print("No skewed columns found.")
# else:
#     print("Skewed columns:")
#     for column, skewness in skewed_columns:
#         print(f"{column}: Skewness = {skewness}")

In [23]:
# for column, _ in skewed_columns:
#     if train[column].dtype in ['int64', 'float64']:
#         train[column] = train[column].apply(lambda x: 1 if x == 0 else np.log(x))

In [24]:
# Split the training frame into predictors (X) and the target (y).
# y is kept as a one-column DataFrame rather than a Series.
_is_target = train.columns == 'price_doc'
X = train.loc[:, ~_is_target]
y = train[['price_doc']]

In [25]:
# ols_reg = LinearRegression()
# sfs = SequentialFeatureSelector(ols_reg, direction='forward',n_features_to_select=5)
# sfs.fit(X, y)
# print(sfs.get_feature_names_out())

In [26]:
# X = X[['full_sq', 'mosque_count_500', 'leisure_count_500', 'cafe_count_1000_price_high', 'leisure_count_1000']]
# test = test[['full_sq', 'mosque_count_500', 'leisure_count_500', 'cafe_count_1000_price_high', 'leisure_count_1000']]

In [27]:
# Show the (rows, columns) dimensions of the feature matrix.
X.shape

(181507, 287)

In [28]:
# Show the (rows, columns) dimensions of the test frame for comparison with X.
test.shape

(77789, 287)

In [29]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [30]:
# reg2 = LinearRegression().fit(X_train, y_train)
# y_pred = reg2.predict(X_test)
# print("LR: R2 = %.4f and MSE = %.2f" % (reg2.score(X_test,y_test), mean_squared_error(y_test, y_pred)))

In [31]:
# reg2 = LinearRegression().fit(X_train, y_train)
# y_pred = reg2.predict(X_test)
# print("LR: R2 = %.4f and MSE = %.2f" % (reg2.score(X_test,y_test), mean_squared_error(y_test, y_pred)))

In [32]:
# Fit an ordinary least-squares model on the full training set (no hold-out split).
reg2 = LinearRegression()
reg2.fit(X, y)

# Predict the target for the test frame with the fitted model.
y_pred = reg2.predict(test)

In [None]:
# Pair each 'row ID' from the untouched test file with its prediction and
# write the table to CSV without the index column.
_submission_columns = {
    'row ID': testOriginal['row ID'],
    'price_doc': y_pred.flatten(),  # predictions are 2-D because y was a one-column frame
}
result_df = pd.DataFrame(_submission_columns)
result_df.to_csv('Day2.1.csv', index=False)