In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score

imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()


In [None]:
train = pd.read_csv('child-mind-institute-problematic-internet-use/train.csv')
test = pd.read_csv('child-mind-institute-problematic-internet-use/test.csv')
train_series_data_stats = pd.read_csv('train_series_data_stats.csv')
test_series_data_stats = pd.read_csv('test_series_data_stats.csv')

# train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
# test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
# train_series_data_stats = pd.read_csv('/kaggle/input/series-data-v2/train_series_data_stats.csv')
# test_series_data_stats = pd.read_csv('/kaggle/input/series-data-v2/test_series_data_stats.csv')



In [3]:
def preprocess_data(train, train_series_data_stats, test, test_series_data_stats):

    train_merged_df = train.merge(train_series_data_stats, how='left', on='id')
    train_merged_df_filtered = train_merged_df[~train_merged_df['sii'].isna()].select_dtypes(include='number')
    train_merged_df_filtered = train_merged_df_filtered.loc[:, ~train_merged_df_filtered.columns.str.contains('PCIAT')]
    test_merged_df = test.merge(test_series_data_stats, how='left', on='id')
    test_merged_df_filtered = test_merged_df.select_dtypes(include='number')

    X_train = train_merged_df_filtered.drop(['sii'], axis=1)
    y_train = train_merged_df_filtered['sii']
    X_test = test_merged_df_filtered

    X_train_imputed = imputer.fit_transform(X_train)
    X_train_scaled = scaler.fit_transform(X_train_imputed)

    X_test_imputed = imputer.transform(X_test)
    X_test_scaled = scaler.transform(X_test_imputed)

    return X_train_scaled, y_train, X_test_scaled

In [4]:
X_train_scaled, y_train, X_test_scaled = preprocess_data(train, train_series_data_stats, test, test_series_data_stats)

In [5]:
from sklearn.decomposition import PCA

pca = PCA(n_components=20)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [6]:
df_pca = pd.DataFrame(X_train_pca, columns=[f'PC{i+1}' for i in range(20)])

In [7]:
# Percentage of variance explained by each component
explained_variance = pca.explained_variance_ratio_
cumulative_variance = explained_variance.cumsum()

print("Explained variance by each of the 10 components:")
for i, variance in enumerate(explained_variance, start=1):
    print(f'PC{i}: {variance:.2%}')

print(f'\nTotal variance explained by the 20 components: {cumulative_variance[-1]:.2%}')


Explained variance by each of the 10 components:
PC1: 16.88%
PC2: 7.74%
PC3: 6.88%
PC4: 4.44%
PC5: 4.28%
PC6: 3.78%
PC7: 3.55%
PC8: 2.90%
PC9: 2.54%
PC10: 2.49%
PC11: 2.19%
PC12: 2.02%
PC13: 2.00%
PC14: 1.86%
PC15: 1.58%
PC16: 1.45%
PC17: 1.43%
PC18: 1.38%
PC19: 1.31%
PC20: 1.23%

Total variance explained by the 20 components: 71.91%


In [21]:
from sklearn.linear_model import LinearRegression

def apply_model(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    reg = LinearRegression().fit(X_train, y_train)
    test_score = reg.score(X_train, y_train)
    print(f'Test Score: {test_score}')

    y_pred = reg.predict(X_test)
    y_train_pred = reg.predict(X_train)
    y_pred = np.round(y_pred).astype(int)
    y_train_pred = np.round(y_train_pred).astype(int)

    qwk_score_train = cohen_kappa_score(y_train, y_train_pred, weights='quadratic')
    qwk_score_val = cohen_kappa_score(y_test, y_pred, weights='quadratic')
    print("Train QWK Score:", qwk_score_train)
    print("Validation QWK Score:", qwk_score_val)

    return reg

In [22]:
model = apply_model(X_train_pca, y_train)

Test Score: 0.1967141292418062
Train QWK Score: 0.334898324596446
Validation QWK Score: 0.31709807218785657


In [24]:
y_pred = model.predict(X_test_pca)
y_pred = np.round(y_pred).astype(int)


In [25]:
submission = pd.DataFrame({'id': test['id'], 'sii': y_pred})
submission.to_csv('submission.csv')

In [26]:
submission

Unnamed: 0,id,sii
0,00008ff9,0
1,000fd460,0
2,00105258,1
3,00115b9f,0
4,0016bb22,1
5,001f3379,1
6,0038ba98,0
7,0068a485,0
8,0069fbed,1
9,0083e397,1
