# In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import numpy as n


def convert_size(value):
    """Convert a size string such as '10M' or '500K' to a float in MB.

    The unit is read from the last character of the (stripped) text:
    ``'M'`` values are returned as-is, ``'K'``/``'k'`` values are divided
    by 1024. Thousands separators (``','``) are removed before parsing.

    Args:
        value: Raw cell value; anything that is not already missing is
            converted with ``str()`` before parsing.

    Returns:
        The size as a ``float``, or ``pd.NA`` when the value is missing,
        has no recognised unit suffix, or its numeric part cannot be parsed.
    """
    if pd.isna(value):
        return pd.NA

    text = str(value).strip().replace(',', '')
    if not text:
        return pd.NA

    unit, digits = text[-1], text[:-1]
    if unit == 'M':
        divisor = 1
    elif unit in ('K', 'k'):
        divisor = 1024
    else:
        return pd.NA

    # A malformed number (e.g. 'M' on its own) is treated as missing rather
    # than aborting the whole column conversion with a ValueError.
    try:
        number = float(digits)
    except ValueError:
        return pd.NA
    return number / divisor
    
def parse_date(value):
    """Parse a date written like 'January 7, 2018' into a pandas timestamp.

    Values that do not match the ``'%B %d, %Y'`` format are coerced to
    ``NaT`` by ``pd.to_datetime``; any exception raised while parsing also
    results in ``pd.NaT``.
    """
    try:
        parsed = pd.to_datetime(value, format='%B %d, %Y', errors='coerce')
    except Exception:
        # Any parsing failure is reported as "Not a Time".
        parsed = pd.NaT
    return parsed

def preprocess_X11(value):
    """Extract the leading version number from a string like '4.1 and up'.

    Only the leading ``major[.minor]`` part is kept, so ``'4.0.3 and up'``
    yields ``4.0`` and ``'4.4W and up'`` yields ``4.4`` instead of being
    discarded as unparseable.

    Args:
        value: Raw cell value; converted with ``str()`` unless missing.

    Returns:
        The version as a ``float``, or ``pd.NA`` when the value is missing,
        contains ``'Varies'``, or does not start with a number.
    """
    # Local import keeps this function self-contained.
    import re

    if pd.isna(value):
        return pd.NA

    text = str(value)
    if 'Varies' in text:
        return pd.NA

    # Anchored at the start: optional whitespace, then major[.minor].
    match = re.match(r'\s*(\d+(?:\.\d+)?)', text)
    if match is None:
        return pd.NA
    return float(match.group(1))
        
# Mapping used when the caller does not pass one. Keeping it as an explicit
# module-level object (instead of a mutable default argument) preserves the
# "labels persist across calls" behaviour while making the shared state visible.
_DEFAULT_LABEL_MAPPING = {}


def label(value, label_mapping=None):
    """Return the integer label for *value*, assigning a new one if needed.

    Labels are handed out in order of first appearance: the first unseen
    value gets 0, the next unseen value gets 1, and so on.

    Args:
        value: Hashable value to encode.
        label_mapping: Dict of already-assigned labels; it is updated in
            place. When omitted, a module-level mapping shared by all such
            calls is used.

    Returns:
        The integer label stored for *value* in the mapping.
    """
    if label_mapping is None:
        label_mapping = _DEFAULT_LABEL_MAPPING
    # len() is evaluated before insertion, so a new value gets the next integer.
    return label_mapping.setdefault(value, len(label_mapping))
        
# Raw columns used as model features (X1, X2 and X7 are intentionally not used).
_FEATURE_COLUMNS = ['X3', 'X4', 'X5', 'X9', 'X11']
# Predictors for the linear-regression imputers. X11 is imputed after X3, so
# it can use the already-imputed X3 as an additional predictor.
_X3_PREDICTORS = ['X4', 'X5', 'X9']
_X11_PREDICTORS = ['X4', 'X3', 'X5', 'X9']
# Fixed reference date (09/05/2025) from which the age of X9 is measured.
_REFERENCE_DATE = datetime(2025, 5, 9)


def _clean_feature_columns(df_clean):
    """Convert the raw X3, X4, X5, X9 and X11 columns to numbers, in place.

    Shared by the train and test pipelines so both apply identical cleaning.
    """
    # X3 (size): 'Varies with device' is treated as missing, the rest goes
    # through convert_size.
    df_clean['X3'] = df_clean['X3'].replace('Varies with device', pd.NA)
    df_clean['X3'] = df_clean['X3'].apply(convert_size)
    df_clean['X3'] = pd.to_numeric(df_clean['X3'], errors='coerce')

    # X4 (downloads): drop '+' at either end and thousands separators.
    # astype(str) lets missing or already-numeric cells pass through instead
    # of failing on a float/int that has no .strip(); unparseable text
    # (including 'nan') is coerced to NaN.
    downloads = (
        df_clean['X4']
        .astype(str)
        .str.strip('+')
        .str.replace(',', '', regex=False)
    )
    df_clean['X4'] = pd.to_numeric(downloads, errors='coerce')

    # X5 (type): Free=0, Paid=1; any other value becomes NaN.
    df_clean['X5'] = df_clean['X5'].map({'Free': 0, 'Paid': 1})

    # X9 (date): number of days between the date and the reference date.
    df_clean['X9'] = df_clean['X9'].apply(parse_date)
    df_clean['X9'] = (_REFERENCE_DATE - df_clean['X9']).dt.days

    # X11 (Android version): leading version number as a float.
    df_clean['X11'] = df_clean['X11'].apply(preprocess_X11)
    df_clean['X11'] = pd.to_numeric(df_clean['X11'], errors='coerce')


def _impute_missing(df_clean, target, predictors, model):
    """Fill missing *target* values, in place, with predictions of *model*."""
    missing = df_clean[target].isna()
    if missing.any():
        # NOTE(review): rows whose predictors are themselves NaN are passed
        # to the model unchanged - confirm the estimator accepts them.
        df_clean.loc[missing, target] = model.predict(df_clean.loc[missing, predictors])


def _fit_imputer(df_clean, target, predictors):
    """Fit a LinearRegression for *target* on its known rows and impute the rest.

    The model is always fitted, even when the training data has no missing
    values, so that it is available for imputing the test data.
    """
    known = df_clean[target].notna()
    model = LinearRegression()
    model.fit(df_clean.loc[known, predictors], df_clean.loc[known, target])
    _impute_missing(df_clean, target, predictors, model)
    return model


def preprocess_train_data(train_path):
    """Load and clean the training CSV and fit the X3/X11 imputers.

    Rows with a missing ``Y`` or a ``Y`` outside ``[1, 5]`` are dropped.
    Missing ``X3`` is imputed from ``X4, X5, X9``; missing ``X11`` is then
    imputed from ``X4, X3, X5, X9``.

    Args:
        train_path: Path of the training CSV; it must contain the columns
            ``X3, X4, X5, X9, X11`` and ``Y``.

    Returns:
        A tuple ``(df_clean, (lr_x3, lr_x11), feature_columns)``: the cleaned
        frame (features plus ``Y``), the two fitted imputation models, and
        the list of feature column names (every column except ``Y``).
    """
    df = pd.read_csv(train_path)
    df_clean = df[_FEATURE_COLUMNS + ['Y']].copy()

    # Keep only rows with a usable target. The final copy() makes the
    # filtered frame independent, so the column assignments below do not
    # trigger chained-assignment warnings.
    df_clean = df_clean.dropna(subset=['Y'])
    df_clean = df_clean[(df_clean['Y'] >= 1) & (df_clean['Y'] <= 5)].copy()

    _clean_feature_columns(df_clean)

    # Order matters: X3 must be imputed before it is used to predict X11.
    lr_x3 = _fit_imputer(df_clean, 'X3', _X3_PREDICTORS)
    lr_x11 = _fit_imputer(df_clean, 'X11', _X11_PREDICTORS)

    return df_clean, (lr_x3, lr_x11), df_clean.columns.drop('Y').tolist()


# Preprocess test data
def preprocess_test_data(test_path, lr_models, feature_columns):
    """Load and clean the test CSV with the imputers fitted on the train data.

    Args:
        test_path: Path of the test CSV; it must contain the columns
            ``X3, X4, X5, X9`` and ``X11``.
        lr_models: ``(lr_x3, lr_x11)`` tuple as returned by
            ``preprocess_train_data``.
        feature_columns: Feature names, in the order the model expects.

    Returns:
        A tuple ``(df_clean, test_ids)``: the feature frame restricted to and
        ordered by *feature_columns*, and the row identifiers taken from the
        ``row_id`` column, else the ``ID`` column, else the frame index.
    """
    lr_x3, lr_x11 = lr_models
    df_test = pd.read_csv(test_path)

    if 'row_id' in df_test.columns:
        test_ids = df_test['row_id']
    elif 'ID' in df_test.columns:
        test_ids = df_test['ID']
    else:
        test_ids = df_test.index

    df_clean = df_test[_FEATURE_COLUMNS].copy()
    _clean_feature_columns(df_clean)

    # Reuse the models trained on the train data; X3 first, since X11's
    # imputer takes X3 as a predictor.
    _impute_missing(df_clean, 'X3', _X3_PREDICTORS, lr_x3)
    _impute_missing(df_clean, 'X11', _X11_PREDICTORS, lr_x11)

    # Any expected feature that is absent is added as a constant 0 column,
    # then the columns are put in the training order.
    for col in feature_columns:
        if col not in df_clean.columns:
            df_clean[col] = 0
    df_clean = df_clean[feature_columns]

    return df_clean, test_ids

def train_on_full_data(X, y, k=5):
    """Cross-validate a random forest, then fit the final model on all rows.

    A shuffled k-fold split (``random_state=42``) is used to report R²,
    MAE and RMSE per fold and on average. Afterwards a fresh scaler and
    model are fitted on the complete data set, so the returned model has
    seen every training row rather than only the last fold's training part.

    Args:
        X: Feature frame (indexed positionally with ``.iloc``).
        y: Target series aligned with *X* (indexed with ``.iloc``).
        k: Number of cross-validation folds.

    Returns:
        A tuple ``(model, scaler)``: the ``RandomForestRegressor`` and the
        ``StandardScaler`` fitted on all of *X* and *y*.
    """
    def build_model():
        # Single place for the hyper-parameters, so the cross-validated
        # models and the final model are configured identically.
        return RandomForestRegressor(
            n_estimators=200,
            random_state=42,
            min_samples_leaf=10,
            max_features=3,
            max_depth=6,
        )

    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    r2_scores = []
    mae_scores = []
    rmse_scores = []

    for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
        print(f"\nFold {fold}/{k}")

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Fit the scaler on the training part only, so no statistics leak
        # from the held-out part.
        fold_scaler = StandardScaler()
        X_train_scaled = fold_scaler.fit_transform(X_train)
        X_test_scaled = fold_scaler.transform(X_test)

        fold_model = build_model()
        fold_model.fit(X_train_scaled, y_train)
        predictions = fold_model.predict(X_test_scaled)

        r2 = r2_score(y_test, predictions)
        mae = mean_absolute_error(y_test, predictions)
        # sqrt(MSE) instead of `squared=False`: that keyword is deprecated
        # and no longer accepted by recent scikit-learn releases.
        rmse = np.sqrt(mean_squared_error(y_test, predictions))

        print(f"R²: {r2:.4f}, MAE: {mae:.4f}, RMSE: {rmse:.4f}")

        r2_scores.append(r2)
        mae_scores.append(mae)
        rmse_scores.append(rmse)

    print("\nOverall Performance Across Folds:")
    print(f"Mean R²: {np.mean(r2_scores):.4f}")
    print(f"Mean MAE: {np.mean(mae_scores):.4f}")
    print(f"Mean RMSE: {np.mean(rmse_scores):.4f}")

    # Final fit on every row; the folds above only estimate performance.
    scaler = StandardScaler()
    model = build_model()
    model.fit(scaler.fit_transform(X), y)

    return model, scaler



def generate_submission(model, scaler, test_data, test_ids, output_path, sample_submission_path):
    """Predict on the test features and write the submission CSV.

    The features are scaled with *scaler*, predicted with *model*, and the
    predictions rounded to two decimals. Row identifiers are read from the
    ``row_id`` column of the sample submission file; *test_ids* is accepted
    but not used. The result is written to *output_path* with the columns
    ``row_id`` and ``Y`` and no index column.
    """
    rounded_predictions = model.predict(scaler.transform(test_data)).round(2)

    # The sample submission is read only for its row identifiers.
    row_ids = pd.read_csv(sample_submission_path)['row_id']

    submission = pd.DataFrame({'row_id': row_ids, 'Y': rounded_predictions})
    submission.to_csv(output_path, index=False)
    print(f"Submission saved to {output_path}")

if __name__ == "__main__":
    # Kaggle input files and the location of the generated submission.
    train_path = "/kaggle/input/app-rating-competition/train.csv"
    test_path = "/kaggle/input/app-rating-competition/test.csv"
    sample_submission_path = "/kaggle/input/app-rating-competition/SampleSubmission.csv"
    submission_path = "/kaggle/working/submission.csv"

    # Clean the training data; the fitted imputers are reused for the test set.
    train_data, (lr_x3, lr_x11), feature_columns = preprocess_train_data(train_path)

    # Correlation of every column with the target, strongest first.
    print("\nCorrelation matrix:")
    target_correlation = train_data.corr()['Y']
    print(target_correlation.sort_values(ascending=False))

    # Split the cleaned frame into target and features.
    y = train_data['Y']
    X = train_data.drop(columns='Y')

    model, scaler = train_on_full_data(X, y)

    # Apply the same preprocessing to the test set and write the predictions.
    test_data, test_ids = preprocess_test_data(
        test_path, (lr_x3, lr_x11), feature_columns
    )
    generate_submission(
        model, scaler, test_data, test_ids, submission_path, sample_submission_path
    )