In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the CSV
# files stored under /content/drive/MyDrive can be read with ordinary paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import dok_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [None]:
# Load the raw CSV tables from the mounted Drive folder.
# DATA_DIR is the single place to edit when the data lives somewhere else.
DATA_DIR = '/content/drive/MyDrive/RS_Project'

members = pd.read_csv(f'{DATA_DIR}/members.csv')
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
songs = pd.read_csv(f'{DATA_DIR}/songs.csv')
song_extra_info = pd.read_csv(f'{DATA_DIR}/song_extra_info.csv')
sample = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
def preprocess_members(members):
    """Clean the members table and build a numeric feature matrix per member.

    The function works on a copy, so the frame passed in is left untouched;
    callers get the cleaned version back as the second return value.

    Parameters
    ----------
    members : pandas.DataFrame
        Must provide the columns read below: ``msno``, ``city``, ``gender``,
        ``registered_via``, ``bd``, ``registration_init_time`` and
        ``expiration_date`` (the two dates in a form that
        ``pd.to_datetime(..., format='%Y%m%d')`` accepts).

    Returns
    -------
    user_features : pandas.DataFrame
        Indexed by ``msno``. Columns are positional integers: first the
        one-hot encoding of ``city``, ``gender`` and ``registered_via``, then
        the min-max scaled ``bd`` and ``membership_days``.
    members : pandas.DataFrame
        The cleaned copy: parsed date columns, a new ``membership_days``
        column, ``bd`` NaNs replaced by 0 and ``gender`` NaNs by 'unknown'.
    """
    # Copy first so re-running the cell never sees a half-processed frame.
    members = members.copy()

    members['registration_init_time'] = pd.to_datetime(members['registration_init_time'], format='%Y%m%d')
    members['expiration_date'] = pd.to_datetime(members['expiration_date'], format='%Y%m%d')
    members['membership_days'] = (members['expiration_date'] - members['registration_init_time']).dt.days

    members['bd'] = members['bd'].fillna(0)
    members['gender'] = members['gender'].fillna('unknown')

    # Dense output so the encoded block can be stacked with the scaled block.
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoded_features = encoder.fit_transform(members[['city', 'gender', 'registered_via']])

    scaler = MinMaxScaler()
    scaled_features = scaler.fit_transform(members[['bd', 'membership_days']])

    # One row per member: [one-hot columns | scaled numeric columns].
    feature_matrix = np.hstack([encoded_features, scaled_features])
    user_features = pd.DataFrame(feature_matrix, index=members['msno'])
    return user_features, members


user_features, members = preprocess_members(members)

In [None]:
# Encodings learned by the most recent ``is_train=True`` call. They are reused
# by ``is_train=False`` calls so train and test share one integer coding.
_FITTED_ENCODINGS = {'categories': {}, 'song_length_mean': None}


def preprocess_train_test(data, members, is_train=True):
    """Join member attributes onto interaction rows and encode source columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Interaction rows with ``msno``, ``source_system_tab``,
        ``source_screen_name`` and ``source_type``; ``song_length`` is
        handled when present.
    members : pandas.DataFrame
        Member table, left-joined on ``msno``.
    is_train : bool, default True
        ``True``: learn the category -> code mapping (and the ``song_length``
        mean) from ``data`` and remember it. ``False``: reuse what the last
        training call learned, so a given string maps to the same integer in
        both frames. If no training call has happened yet, the mapping is
        derived from ``data`` itself.

    Returns
    -------
    pandas.DataFrame
        A new frame (the inputs are not modified) with missing source values
        replaced by 'unknown' and the three source columns turned into
        integer codes. With ``is_train=False``, values never seen during
        training get code -1.
    """
    categorical_cols = ['source_system_tab', 'source_screen_name', 'source_type']
    fitted_categories = _FITTED_ENCODINGS['categories']

    data = data.merge(members, on='msno', how='left')

    if is_train:
        # A new training frame invalidates everything learned before.
        fitted_categories.clear()
        _FITTED_ENCODINGS['song_length_mean'] = None

    for col in categorical_cols:
        filled = data[col].fillna('unknown')
        if is_train or col not in fitted_categories:
            encoded = filled.astype('category')
            if is_train:
                fitted_categories[col] = encoded.cat.categories
        else:
            # Fixed category list: codes line up with the training frame.
            encoded = filled.astype(pd.CategoricalDtype(categories=fitted_categories[col]))
        data[col] = encoded.cat.codes

    if 'song_length' in data.columns:
        train_mean = _FITTED_ENCODINGS['song_length_mean']
        if is_train or train_mean is None:
            fill_value = data['song_length'].mean()
            if is_train:
                _FITTED_ENCODINGS['song_length_mean'] = fill_value
        else:
            fill_value = train_mean
        data['song_length'] = data['song_length'].fillna(fill_value)

    return data

# Join member attributes onto every interaction row and integer-encode the
# three source_* columns (see preprocess_train_test).
# NOTE: both names are overwritten with the processed frames, so running this
# cell a second time merges `members` again and pandas suffixes the
# duplicated columns.
train = preprocess_train_test(train, members, is_train=True)

test = preprocess_train_test(test, members, is_train=False)


In [None]:
def calculate_similarity_features(user_features, top_k=250, chunk_size=1024):
    """Summarise how similar each user is to their nearest neighbours.

    For every row of ``user_features`` the cosine similarity to all other
    rows is computed, the ``top_k`` largest values are kept, and their mean
    and maximum are reported.

    The similarities are produced in row chunks, so only a
    ``chunk_size x n_users`` block is held in memory at a time instead of the
    full ``n_users x n_users`` matrix.

    Parameters
    ----------
    user_features : pandas.DataFrame
        Numeric feature matrix, one row per user; the index supplies the
        ``msno`` values of the result.
    top_k : int, default 250
        Number of most-similar other users to aggregate over. Capped at
        ``n_users - 1``.
    chunk_size : int, default 1024
        Number of users processed per similarity block.

    Returns
    -------
    pandas.DataFrame
        Columns ``msno``, ``similarity_mean`` and ``similarity_max``, one row
        per user in the order of ``user_features.index``. Both statistics are
        NaN when there is no other user to compare against.

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    user_ids = user_features.index
    n_users = len(user_ids)
    neighbours = min(top_k, n_users - 1)

    similarity_mean = np.full(n_users, np.nan)
    similarity_max = np.full(n_users, np.nan)

    if neighbours > 0:
        matrix = user_features.to_numpy(dtype=float)
        first_kept = n_users - neighbours
        for start in range(0, n_users, chunk_size):
            stop = min(start + chunk_size, n_users)
            block = cosine_similarity(matrix[start:stop], matrix)
            # Exclude each user's similarity to themselves explicitly rather
            # than relying on it sorting first.
            block[np.arange(stop - start), np.arange(start, stop)] = -np.inf
            # Partial sort: the last `neighbours` columns hold the largest
            # values of each row (in no particular order).
            top = np.partition(block, first_kept, axis=1)[:, first_kept:]
            similarity_mean[start:stop] = top.mean(axis=1)
            similarity_max[start:stop] = top.max(axis=1)

    return pd.DataFrame({
        'msno': user_ids,
        'similarity_mean': similarity_mean,
        'similarity_max': similarity_max,
    })


similarity_features = calculate_similarity_features(user_features)

In [None]:

# Attach each user's neighbourhood statistics to the interaction rows.
# Similarity columns left over from an earlier run of this cell are dropped
# first, so re-running it does not create *_x / *_y duplicates.
similarity_cols = [col for col in similarity_features.columns if col != 'msno']

train = train.drop(columns=similarity_cols, errors='ignore').merge(similarity_features, on='msno', how='left')
test = test.drop(columns=similarity_cols, errors='ignore').merge(similarity_features, on='msno', how='left')

In [None]:

# Fit a random forest on half of the training rows and evaluate it on the
# other half.
X = train.drop(columns=['target'])
y = train['target']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.5, random_state=42)

non_numeric_columns = X_train.select_dtypes(include=['object']).columns
print(f"Non-numeric columns: {non_numeric_columns}")

# The model only gets numeric inputs: drop the string columns found above and
# the two datetime columns.
unused_columns = list(non_numeric_columns) + ['registration_init_time', 'expiration_date']
X_train = X_train.drop(columns=unused_columns, errors='ignore')
X_val = X_val.drop(columns=unused_columns, errors='ignore')

# n_jobs=-1 builds the trees in parallel; random_state keeps the fit repeatable.
model = RandomForestClassifier(random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

val_predictions = model.predict(X_val)

val_accuracy = accuracy_score(y_val, val_predictions)
print(f"Validation Accuracy: {val_accuracy:.2f}")

# The submission is built from predict_proba, so also report a metric that
# scores the probabilities rather than the hard labels.
val_scores = model.predict_proba(X_val)[:, 1]
val_auc = roc_auc_score(y_val, val_scores)
print(f"Validation ROC AUC: {val_auc:.4f}")


Non-numeric columns: Index(['msno', 'song_id', 'gender'], dtype='object')
Validation Accuracy: 0.70


In [None]:

# Score the test rows with the fitted model and write the submission file.
SUBMISSION_PATH = 'submission_with_similarity_onlytrain_100.csv'

# Keep exactly the columns the model was trained on, in the same order.
test = test[X_train.columns]

# Probability of the positive class (second column of predict_proba).
test_predictions = model.predict_proba(test)[:, 1]

# NOTE(review): ids are generated positionally (0..n-1); this assumes they
# match the `id` column of test.csv row for row - confirm.
submission = pd.DataFrame({
    'id': range(len(test)),
    'target': test_predictions
})
submission.to_csv(SUBMISSION_PATH, index=False)
print(f"Submission file saved as {SUBMISSION_PATH}")
