In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import MinMaxScaler

In [2]:
from scipy.stats import skew

def find_skewed_columns(dataframe, threshold=0.5):
    """
    Identify skewed columns in a pandas DataFrame.

    Only columns whose dtype is exactly int64 or float64 are examined.
    Missing values are dropped before the skewness is computed: scipy's
    `skew` returns nan as soon as one input is NaN, and nan never exceeds
    the threshold, so such columns would otherwise be silently skipped.

    Parameters:
    - dataframe: pandas DataFrame
    - threshold: Skewness threshold to determine if a column is skewed (default is 0.5).
      A column is reported when abs(skewness) is strictly greater than it.

    Returns:
    - List of tuples containing (column_name, skewness_value) for skewed columns,
      in the order the columns appear in the DataFrame.
    """
    skewed_columns = []

    for column in dataframe.columns:
        if dataframe[column].dtype in ['int64', 'float64']:
            observed = dataframe[column].dropna()
            if observed.empty:
                # Nothing to measure in an empty or all-NaN column.
                continue
            skewness = skew(observed)
            if abs(skewness) > threshold:
                skewed_columns.append((column, skewness))

    return skewed_columns

def check_scaling_needed(dataframe, threshold=5):
    """
    Flag numeric columns whose value range suggests they may need scaling.

    Only columns whose dtype is exactly int64 or float64 are examined.

    Parameters:
    - dataframe: pandas DataFrame
    - threshold: A column is reported when (max - min) is strictly greater
      than this value (default is 5)

    Returns:
    - List of (column_name, range_value) tuples, in DataFrame column order,
      for the columns that may require scaling.
    """
    numeric_dtypes = ['int64', 'float64']
    flagged = []

    for name in dataframe.columns:
        series = dataframe[name]
        # Skip anything that is not a plain int64/float64 column.
        if series.dtype not in numeric_dtypes:
            continue
        spread = series.max() - series.min()
        if spread > threshold:
            flagged.append((name, spread))

    return flagged

In [3]:
# Load the competition data. `test` is modified by later cells, so an untouched
# copy is kept in `testOriginal` (its 'row ID' column is needed when the
# submission file is written).
train = pd.read_csv('../../2nd-Comp-Data/train.csv')
test = pd.read_csv('../../2nd-Comp-Data/test.csv')
# Copy the frame already in memory instead of parsing the same CSV a second time.
testOriginal = test.copy()

In [4]:
# Dimensions of the raw training frame as (rows, columns).
train.shape

(181507, 272)

In [5]:
# Remove the 'row ID' and 'sub_area' columns from the test frame in a single call.
test.drop(columns=['row ID', 'sub_area'], inplace=True)

In [6]:
# Remove the 'sub_area' column from the training frame as well.
train.drop(columns=['sub_area'], inplace=True)

Categorical to Numerical (one-hot encoding)

In [7]:
# One-hot encode the remaining categorical columns.
train = pd.get_dummies(train)
test = pd.get_dummies(test)

# The two frames are encoded independently, so a category that appears in only
# one of them yields a dummy column the other lacks, and column order may differ.
# Re-index `test` onto the training feature columns (everything except the
# target): missing dummies are filled with 0, columns unknown to the training
# data are dropped, and the order is made identical.
feature_columns = train.columns[train.columns != 'price_doc']
test = test.reindex(columns=feature_columns, fill_value=0)

Scaling (currently disabled: the cells in this section are commented out)

In [8]:
# scaling_needed_columns = check_scaling_needed(train)

# if not scaling_needed_columns:
#     print("No columns require scaling.")
# else:
#     print("Columns that may require scaling:")
#     for column, column_range in scaling_needed_columns:
#         print(f"{column}: Range = {column_range}")


In [9]:
# scaling_needed_columns = check_scaling_needed(train)
# scaler = MinMaxScaler()

# for column, _ in scaling_needed_columns:
#     if train[column].dtype in ['int64', 'float64']:
#         train[column] = scaler.fit_transform(train[[column]])

Normalization (log-transforming skewed columns)

In [10]:
# Collect the (column, skewness) pairs once; a later cell reuses `skewed_columns`.
skewed_columns = find_skewed_columns(train)

if skewed_columns:
    print("Skewed columns:")
    for name, value in skewed_columns:
        print(f"{name}: Skewness = {value}")
else:
    print("No skewed columns found.")

Skewed columns:
full_sq: Skewness = 2.9087842962497996
life_sq: Skewness = 3.009192993375616
floor: Skewness = 2.688649903999332
area_m: Skewness = 2.2060704026588662
raion_popul: Skewness = 0.5396642095318434
green_zone_part: Skewness = 0.9783776212652736
indust_part: Skewness = 0.9075324786679504
children_preschool: Skewness = 1.2095665329895529
preschool_education_centers_raion: Skewness = 0.597380337910174
children_school: Skewness = 1.0794304517714368
school_education_centers_top_20_raion: Skewness = 1.9556211831048462
healthcare_centers_raion: Skewness = 0.7619618969715417
university_top_20_raion: Skewness = 2.0220915540187976
sport_objects_raion: Skewness = 1.1080408409523737
additional_education_raion: Skewness = 1.8179306260988264
culture_objects_top_25_raion: Skewness = 2.6715541515113923
shopping_centers_raion: Skewness = 1.4692835773203703
office_raion: Skewness = 2.210852135036375
full_all: Skewness = 2.1505200560280215
male_f: Skewness = 2.144854488783137
female_f: Skewne

In [11]:
# Reduce skewness with a log transform.
#  - log1p(x) = log(1 + x) is used so that zeros map to 0 and the transform
#    stays monotonic (mapping 0 -> 1 while 1 -> log(1) = 0 would rank zeros
#    above ones).
#  - The same transform is applied to `test`; otherwise the model is fitted on
#    log-scaled features and then asked to predict from raw ones.
#  - The target 'price_doc' is left untouched so predictions stay in the
#    target's original units (they are written out without any inverse step).
#  - Columns containing negative values are skipped: the log is undefined there
#    and would introduce NaN.
# NOTE: not idempotent; re-running this cell transforms the columns again.
for column, _ in skewed_columns:
    if column == 'price_doc' or train[column].dtype not in ['int64', 'float64']:
        continue
    frames = [train, test] if column in test.columns else [train]
    if any((frame[column] < 0).any() for frame in frames):
        continue
    for frame in frames:
        frame[column] = np.log1p(frame[column])

In [12]:
# Split the training frame into the feature matrix and the target.
is_feature = train.columns != 'price_doc'
X = train.loc[:, is_feature]
# The list selector keeps y as a single-column DataFrame rather than a Series.
y = train.loc[:, ['price_doc']]

In [13]:
# ols_reg = LinearRegression()
# sfs = SequentialFeatureSelector(ols_reg, direction='forward',n_features_to_select=5)
# sfs.fit(X, y)
# print(sfs.get_feature_names_out())

In [14]:
# X = X[['full_sq', 'mosque_count_500', 'leisure_count_500', 'cafe_count_1000_price_high', 'leisure_count_1000']]
# test = test[['full_sq', 'mosque_count_500', 'leisure_count_500', 'cafe_count_1000_price_high', 'leisure_count_1000']]

In [15]:
# Dimensions of the feature matrix as (rows, columns); the column count should
# match the test frame checked in the next cell.
X.shape

(181507, 287)

In [16]:
# Dimensions of the prepared test frame as (rows, columns).
test.shape

(77789, 287)

In [17]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [18]:
# reg2 = LinearRegression().fit(X_train, y_train)
# y_pred = reg2.predict(X_test)
# print("LR: R2 = %.4f and MSE = %.2f" % (reg2.score(X_test,y_test), mean_squared_error(y_test, y_pred)))

In [19]:
# reg2 = LinearRegression().fit(X_train, y_train)
# y_pred = reg2.predict(X_test)
# print("LR: R2 = %.4f and MSE = %.2f" % (reg2.score(X_test,y_test), mean_squared_error(y_test, y_pred)))

In [20]:
# Fit ordinary least squares on the complete training set (no hold-out split).
reg2 = LinearRegression()
reg2.fit(X, y)

# Predict the target for every row of the prepared test frame.
y_pred = reg2.predict(test)

In [None]:
# Pair each row ID from the untouched test copy with its prediction and write
# the submission file.
row_ids = testOriginal['row ID']
predictions = y_pred.flatten()  # collapse the prediction array to 1-D
result_df = pd.DataFrame({'row ID': row_ids, 'price_doc': predictions})
result_df.to_csv('Day2.2.csv', index=False)