In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tools import add_constant
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold

def convert_size(value):
    """Convert an app-size string such as ``'19M'`` or ``'201k'`` to megabytes.

    The unit is read from the final character, case-insensitively: ``K``
    (kilobytes), ``M`` (megabytes) or ``G`` (gigabytes).  Surrounding
    whitespace and thousands separators (``,``) are ignored.

    Args:
        value: Raw cell value; may be a string, a number or a missing value.

    Returns:
        The size in megabytes as a ``float``, or ``pd.NA`` when the value is
        missing, carries no recognised unit suffix (for example
        ``'Varies with device'``) or has a numeric part that cannot be parsed.
    """
    if pd.isna(value):
        return pd.NA

    text = str(value).strip().replace(',', '')
    if not text:
        return pd.NA

    # Divisors that turn each unit into megabytes.
    divisors = {'K': 1024.0, 'M': 1.0, 'G': 1.0 / 1024.0}
    divisor = divisors.get(text[-1].upper())
    if divisor is None:
        return pd.NA

    try:
        return float(text[:-1]) / divisor
    except ValueError:
        # The text ends in a unit letter but is not a size (e.g. 'Medium').
        return pd.NA

def preprocess_X11(value):
    """Extract a numeric Android version from a raw ``X11`` cell.

    Only the first whitespace-separated token is considered.  If that token
    is a valid float (``'4.1 and up'`` -> ``4.1``) it is returned as is.
    Otherwise the leading ``major.minor`` digits are used, so three-part or
    suffixed versions are kept instead of being discarded
    (``'4.0.3 and up'`` -> ``4.0``, ``'4.4W and up'`` -> ``4.4``).

    Args:
        value: Raw cell value; may be a string or a missing value.

    Returns:
        The version as a ``float``, or ``pd.NA`` when the value is missing,
        empty, contains ``'Varies'`` or does not start with a digit.
    """
    import re  # local import: only this function needs it

    if pd.isna(value):
        return pd.NA

    text = str(value)
    if 'Varies' in text:
        return pd.NA

    tokens = text.split()
    if not tokens:
        # Empty or whitespace-only cell: nothing to parse.
        return pd.NA

    leading = tokens[0]
    try:
        return float(leading)
    except ValueError:
        pass

    # Fall back to the leading "major.minor" digits of the token.
    match = re.match(r'\d+(?:\.\d+)?', leading)
    return float(match.group()) if match else pd.NA

def preprocess_train_data(train_path):
    """Load the training CSV and build the numeric feature table.

    Rows with a missing ``Y`` or a ``Y`` outside ``[1, 5]`` are discarded.
    ``X3`` is parsed with ``convert_size``, ``X5`` is encoded as
    ``Free -> 0`` / ``Paid -> 1`` (anything else becomes missing) and ``X11``
    is parsed with ``preprocess_X11``.  Remaining gaps are imputed: median
    for ``X3`` and ``X11``, most frequent value for ``X5``.

    Args:
        train_path: Path of the training CSV; it must contain the columns
            ``X3``, ``X5``, ``X11`` and ``Y``.

    Returns:
        A tuple ``(df_clean, (num_imputer, cat_imputer), feature_columns)``:
        the cleaned frame (features plus ``Y``), the two fitted imputers so
        the same fill values can be applied to other data, and the list of
        feature column names (every column except ``Y``).
    """
    df = pd.read_csv(train_path)

    # Select and filter first, then copy once, so the column assignments
    # below act on an independent frame rather than on a derived slice
    # (avoids pandas' chained-assignment warning).
    df_clean = df.dropna(subset=['Y'])[['X3', 'X5', 'X11', 'Y']]
    df_clean = df_clean[df_clean['Y'].between(1, 5)].copy()

    # Convert app size; convert_size already yields pd.NA for missing and
    # unit-less values such as 'Varies with device'.
    df_clean['X3'] = pd.to_numeric(
        df_clean['X3'].apply(convert_size), errors='coerce'
    )

    # Convert type
    df_clean['X5'] = df_clean['X5'].map({'Free': 0, 'Paid': 1})

    # Convert Android version
    df_clean['X11'] = pd.to_numeric(
        df_clean['X11'].apply(preprocess_X11), errors='coerce'
    )

    # Separate imputers for numeric and categorical features
    num_imputer = SimpleImputer(strategy='median')
    cat_imputer = SimpleImputer(strategy='most_frequent')

    df_clean[['X3', 'X11']] = num_imputer.fit_transform(df_clean[['X3', 'X11']])
    # The imputer returns a 2-D, single-column result; flatten it so the
    # single-column assignment receives a 1-D array.
    df_clean['X5'] = np.ravel(cat_imputer.fit_transform(df_clean[['X5']]))

    feature_columns = df_clean.columns.drop('Y').tolist()
    return df_clean, (num_imputer, cat_imputer), feature_columns

def preprocess_test_data(test_path, imputers, feature_columns):
    """Load the test CSV and apply the same feature preparation as training.

    ``X3``, ``X5`` and ``X11`` are parsed exactly as in
    ``preprocess_train_data`` and missing values are filled with the
    already-fitted imputers (no re-fitting happens here).

    Args:
        test_path: Path of the test CSV; it must contain the columns ``X3``,
            ``X5`` and ``X11``.
        imputers: Tuple ``(num_imputer, cat_imputer)`` of fitted imputers;
            the first is applied to ``X3``/``X11``, the second to ``X5``.
        feature_columns: Column names, in order, that the returned frame
            must have.

    Returns:
        A tuple ``(features, test_ids)``.  ``features`` contains exactly
        ``feature_columns`` (any column not produced here is filled with 0).
        ``test_ids`` is the ``row_id`` column when the file has one,
        otherwise the frame's index.
    """
    num_imputer, cat_imputer = imputers
    df_test = pd.read_csv(test_path)

    # Fall back to the index when the file has no 'row_id' column.
    test_ids = df_test.get('row_id', df_test.index)

    df_clean = df_test[['X3', 'X5', 'X11']].copy()

    # Convert app size; convert_size already yields pd.NA for missing and
    # unit-less values such as 'Varies with device'.
    df_clean['X3'] = pd.to_numeric(
        df_clean['X3'].apply(convert_size), errors='coerce'
    )

    # Convert type
    df_clean['X5'] = df_clean['X5'].map({'Free': 0, 'Paid': 1})

    # Convert Android version
    df_clean['X11'] = pd.to_numeric(
        df_clean['X11'].apply(preprocess_X11), errors='coerce'
    )

    # Apply the fitted imputers (transform only).
    df_clean[['X3', 'X11']] = num_imputer.transform(df_clean[['X3', 'X11']])
    # Flatten the 2-D, single-column result for the single-column assignment.
    df_clean['X5'] = np.ravel(cat_imputer.transform(df_clean[['X5']]))

    # Put the columns in training order; columns absent here are added as 0.
    df_clean = df_clean.reindex(columns=feature_columns, fill_value=0)
    return df_clean, test_ids

def train_on_full_data(X, y, *, iterations=500, learning_rate=0.000455,
                       depth=5, random_seed=42):
    """Fit a CatBoost regressor on all of ``X``/``y`` and print fit metrics.

    The printed R², MAE and RMSE are computed on the same data the model was
    trained on, so they describe the fit, not generalisation.

    Args:
        X: Feature frame.  Columns of dtype ``category`` are passed to
            CatBoost as categorical features.
        y: Target values aligned with ``X``.
        iterations: Number of boosting iterations.
        learning_rate: CatBoost learning rate.
        depth: Tree depth.
        random_seed: Seed passed to CatBoost.

    Returns:
        The fitted ``CatBoostRegressor``.
    """
    # Define categorical features
    cat_features = X.select_dtypes('category').columns.tolist()

    # Initialize and train CatBoost
    model = CatBoostRegressor(
        iterations=iterations,
        learning_rate=learning_rate,
        depth=depth,
        cat_features=cat_features,
        random_seed=random_seed,
        verbose=0,
    )
    model.fit(X, y)

    # Evaluate on the training data
    predictions = model.predict(X)
    # The `squared=False` argument of mean_squared_error is deprecated and
    # removed in recent scikit-learn releases; taking the square root of the
    # MSE gives the same RMSE on every version.
    rmse = np.sqrt(mean_squared_error(y, predictions))

    print("\nCatBoost Training Performance:")
    print(f"R²: {r2_score(y, predictions):.4f}")
    print(f"MAE: {mean_absolute_error(y, predictions):.4f}")
    print(f"RMSE: {rmse:.4f}")

    return model

def analyze_features(X, y, model=None):
    """Plot diagnostic charts for the features and, optionally, a model.

    Always shows a correlation heatmap of ``X`` joined with ``y``.  When a
    model is given, also shows its feature importances and one partial
    dependence panel per feature: the feature is swept while every other
    feature is held at a baseline (median for numeric columns, mode
    otherwise), with the actual data drawn as a scatter behind the curve.

    Args:
        X: Feature frame.
        y: Target series; it is joined to ``X`` as a column, so it needs a
            name that does not clash with a column of ``X``.
        model: Optional fitted model exposing ``get_feature_importance()``
            and ``predict()``.  When ``None`` only the heatmap is drawn.

    Returns:
        None.  The figures are displayed with ``plt.show()``.
    """
    # Feature correlation analysis
    plt.figure(figsize=(12, 8))
    corr_matrix = X.join(y).corr()
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
    plt.title("Feature Correlation Matrix")
    plt.show()

    # Explicit None check: a model object's truthiness is not a reliable
    # signal of whether one was supplied.
    if model is None:
        return

    # CatBoost feature importance
    importance_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': model.get_feature_importance(),
    }).sort_values('Importance', ascending=False)

    plt.figure(figsize=(10, 6))
    sns.barplot(x='Importance', y='Feature', data=importance_df)
    plt.title("CatBoost Feature Importance")
    plt.show()

    # Partial dependence plots
    numerical_features = set(X.select_dtypes(include=np.number).columns)

    # Baseline row shared by every panel; it does not depend on the swept
    # feature, so it is computed once.
    base_values = {
        col: X[col].median() if col in numerical_features else X[col].mode()[0]
        for col in X.columns
    }

    # Keep the 3x3 grid for up to nine features and add rows beyond that.
    n_cols = 3
    n_rows = max(3, -(-len(X.columns) // n_cols))

    plt.figure(figsize=(15, 10))
    for i, feature in enumerate(X.columns):
        plt.subplot(n_rows, n_cols, i + 1)

        # Sweep numeric features across their 5th-95th percentile range;
        # use the observed values for everything else.
        if feature in numerical_features:
            values = np.linspace(
                X[feature].quantile(0.05), X[feature].quantile(0.95), 100
            )
        else:
            values = X[feature].unique()

        # Synthetic rows: baseline everywhere except the swept feature.
        synthetic = pd.DataFrame([base_values] * len(values))
        synthetic[feature] = values

        # Predict and plot
        preds = model.predict(synthetic)
        sns.lineplot(x=values, y=preds, color='red', label='Partial Dependence')
        sns.scatterplot(x=X[feature], y=y, alpha=0.1, label='Actual Data')
        plt.title(f"Partial Dependence: {feature}")
        plt.legend()

    plt.tight_layout()
    plt.show()
        
def generate_submission(model, test_data, output_path, sample_submission_path):
    """Predict on ``test_data`` and write a ``row_id``/``Y`` submission CSV.

    Args:
        model: Fitted model exposing ``predict()``.
        test_data: Features to predict on, one row per submission row.
        output_path: Destination of the submission CSV.
        sample_submission_path: CSV whose ``row_id`` column supplies the ids
            (and therefore the row count and order) of the submission.

    Returns:
        None.  The file is written and a confirmation line is printed.
    """
    # Predictions are rounded to two decimals before being written.
    rounded_preds = model.predict(test_data).round(2)

    # Only the id column of the sample file is reused.
    id_frame = pd.read_csv(sample_submission_path)[['row_id']]
    id_frame.assign(Y=rounded_preds).to_csv(output_path, index=False)

    print(f"Submission saved to {output_path}")

if __name__ == "__main__":
    # Competition input files and the destination of the submission.
    train_path = "/kaggle/input/app-rating-competition/train.csv"
    test_path = "/kaggle/input/app-rating-competition/test.csv"
    sample_path = "/kaggle/input/app-rating-competition/SampleSubmission.csv"
    output_path = "/kaggle/working/submission.csv"

    # Clean the training set; the fitted imputers and the feature list are
    # kept so the test set can be prepared the same way.
    train_data, imputers, feature_cols = preprocess_train_data(train_path)

    # Split the target from the features.
    y = train_data['Y']
    X = train_data.drop(columns='Y')

    # Fit on the full training set, then plot the diagnostics.
    model = train_on_full_data(X, y)
    analyze_features(X, y, model=model)

    # Prepare the test set with the training-time imputers and columns.
    test_data, test_ids = preprocess_test_data(test_path, imputers, feature_cols)

    # Predict and write the submission file.
    generate_submission(model, test_data, output_path, sample_path)