In [None]:
'''
%pip install --upgrade pip  --quiet
%pip install pandas  --upgrade --quiet
%pip install numpy  --upgrade --quiet
%pip install scipy  --upgrade --quiet
%pip install statsmodels  --upgrade --quiet
%pip install seaborn  --upgrade --quiet
%pip install scikit-learn==1.3.0
%pip install tqdm ipykernel matplotlib ipywidgets --upgrade --quiet   
%pip install plotly numpy==1.25 nbformat umap-learn
%pip install ucimlrepo
%pip install mlxtend
%pip install pydotplus
%pip install imbalanced-learn
%pip install yellowbrick
%pip install missingno
%load_ext autoreload
'''

Basic imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Apply the seaborn theme first: sns.set() re-applies seaborn's plotting
# context, which rewrites font-related rcParams such as 'font.size'. Calling
# it after the assignments below would silently discard the chosen font size.
sns.set(font_scale=1)

# Notebook-wide matplotlib defaults (set after the theme so they stick).
plt.rcParams['figure.figsize'] = [9, 6]
plt.rcParams['font.size'] = 14

# 1. Anàlisi Exploratòria de Dades (EDA)

#### <span style="color:lightgreen"> Carreguem la base de dades</span>

In [None]:
# Load the raw dataset from a CSV file in the working directory.
df = pd.read_csv("smartphone_data.csv")

# Display (rows, columns) as the cell output.
df.shape

In [None]:
# Preview the first rows of the dataset.
df.head()

In [None]:
# Summary statistics of the dataset's columns.
df.describe()

In [None]:
def classify_features(df, target):
    """Split the predictor columns of ``df`` into numerical and categorical.

    Rules:
      * ``object`` / ``category`` columns are categorical.
      * ``int64`` / ``float64`` columns with fewer than 10 distinct values are
        treated as categorical; the rest are numerical.
      * The ``target`` column is never placed in either list.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to inspect.
    target : str
        Name of the target column, excluded from the classification and
        from the summary table.

    Returns
    -------
    numerical_features : list
        Names of the numerical predictors.
    categorical_features : list
        Names of the categorical predictors (dtype-based ones first, then
        the low-cardinality numeric ones).
    features : pandas.DataFrame
        One row per predictor with its dtype, number of unique values and
        assigned category, sorted by number of unique values. Columns that
        fall in neither list (e.g. ``bool`` dtype) are labelled 'Boolean'.
    """
    numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = [
        column
        for column in df.select_dtypes(include=['object', 'category']).columns
        if column != target
    ]
    numerical_features = []

    for column in numeric_columns:
        if column == target:
            continue
        # Low-cardinality numeric columns behave like categories.
        if df[column].nunique() < 10:
            categorical_features.append(column)
        else:
            numerical_features.append(column)

    def feature_type(column):
        if column in numerical_features:
            return 'Numerical'
        elif column in categorical_features:
            return 'Categorical'
        else:
            return 'Boolean'

    # Use the `target` argument (not a hard-coded column name) so the summary
    # table is correct for any target.
    predictors = [column for column in df.columns if column != target]

    features = pd.DataFrame({
        'Feature': predictors,
        'Type': [df[column].dtype for column in predictors],
        'Unique values': [df[column].nunique() for column in predictors],
        'Category': [feature_type(column) for column in predictors]
    })

    features = features.sort_values(by='Unique values', ascending=True)
    return numerical_features, categorical_features, features

# Classify every predictor, using 'price' as the target column.
numerical_features, categorical_features, features = classify_features(df, 'price')

# Show the per-feature summary table.
features

In [None]:
# Map True/False to 1/0 in the boolean flag columns.
bool_to_int = {True: 1, False: 0}
for variable in ['has_5g', 'has_nfc', 'has_ir_blaster']:
    recoded = df[variable].map(bool_to_int)
    df[variable] = recoded

In [None]:
# Re-run the classification after the flag columns were recoded to 1/0.
numerical_features, categorical_features, features = classify_features(df, 'price')

# Show the refreshed per-feature summary table.
features

In [None]:
# Remove 'model' from the list of categorical features and from `df`.
# Written so the cell can be re-run safely: filtering the list and dropping
# with errors='ignore' are no-ops when 'model' is already gone, whereas
# list.remove() / an in-place drop would raise on the second run.
categorical_features = [name for name in categorical_features if name != 'model']
df = df.drop(columns='model', errors='ignore')

#### <span style="color:lightgreen"> Visualitzem la distribució de cada variable numèrica</span>

In [None]:
'''for feature in numerical_features:
    mean = df[feature].mean()
    fig, ax = plt.subplots()
    sns.histplot(df[feature], kde=False, ax=ax, edgecolor="black")
    ax.plot([mean], [-0.6], marker='^', markersize=9, color="red")
    ax.set_title(f'Distribució de {feature}')
    ax.set_xlabel(feature, size=10)
    ax.set_ylabel("Freqüència", size=10)
    plt.tight_layout()
    #plt.savefig(f'./plots/distribution_num/{feature}_distribution.png')'''

#### <span style="color:lightgreen"> Histograma de la freqüència per classe de cada variable categòrica </span>

In [None]:
'''for feature in categorical_features:
    plt.figure()
    df[feature].value_counts().plot(kind='bar', color='green')
    plt.title(f'Freqüència per classe del feature {feature}')
    if feature == 'resolution':
        plt.xticks(rotation=90, fontsize=8)  
    else:
        plt.xticks(rotation=70, fontsize=10)
    plt.tight_layout()
    #plt.savefig(f'./plots/frequency_cat/{feature}_frequency.png')'''

#### <span style="color:lightgreen"> Correlacions entre variables numèriques </span>

In [None]:
# Pairwise correlations between the numerical features, drawn as an
# annotated heatmap.
corr_matrix = df[numerical_features].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm', cbar=True, ax=ax)
#plt.savefig('./plots/correlations_heatmap.png')
plt.show()

#### <span style="color:lightgreen"> Correlació entre variables categòriques i variable objectiu </span>

In [None]:
# One boxplot of the target ('price') per categorical feature.
for feature in categorical_features:
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.boxplot(data=df, x=feature, y='price', ax=ax)
    ax.set_title(f'Distribución de price según {feature}')
    plt.xticks(rotation=45)
    plt.show()

#### <span style="color:lightgreen"> Correlació entre variables numèriques i variable objectiu </span>

In [None]:
sns.set(style="whitegrid")

point_style = {'alpha': 0.5}        # semi-transparent scatter points
fit_line_style = {"color": "red"}   # regression line colour

# Scatter plot with fitted regression line of 'price' against each numerical feature.
for feature in numerical_features:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.regplot(data=df, x=feature, y='price',
                scatter_kws=point_style, line_kws=fit_line_style, ax=ax)
    ax.set_title(f'Relación entre Price y {feature} con línea de regresión')
    ax.set_xlabel(feature)
    ax.set_ylabel('Price')
    plt.show()

# 2. Preprocessament

#### <span style="color:lightgreen"> Missings </span>

In [None]:
import missingno as msno

# Visual overview of where the missing values are located in `df`.
msno.matrix(df)

In [None]:
def missing_data(data):
    """Summarise the missing values of each column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with 'Total Missing' (count of NaN) and
        'Percent Missing (%)' (share of rows, rounded to 2 decimals), ordered
        from the most to the least incomplete column.
    """
    total_missing = data.isna().sum()
    percent_missing = (100 * (total_missing / len(data))).round(2)

    summary = pd.DataFrame({
        'Total Missing': total_missing,
        'Percent Missing (%)': percent_missing,
    })

    # Sort once, after building the table. Sorting the two Series separately
    # can give them different index orders when there are ties, and the
    # DataFrame constructor then realigns them, losing the descending order.
    return summary.sort_values('Total Missing', ascending=False, kind='stable')
# Show the missing-value summary for the full dataset.
missing_data(df)

Eliminem 'extended_upto' ja que té gairebé un 50% de missings

In [None]:
# Drop 'extended_upto' from the categorical feature list and from `df`.
# Filtering the list and dropping with errors='ignore' keep the cell
# re-runnable; list.remove() would raise once the name is already gone.
categorical_features = [name for name in categorical_features if name != 'extended_upto']
df = df.drop(columns='extended_upto', errors='ignore')

Els altres missings els imputarem un cop particionem el dataset en train i test

#### <span style="color:lightgreen"> Outliers </span>

In [None]:
# For each numerical feature: boxplot (left) and histogram with KDE (right),
# drawn before any outlier treatment.
for feature in numerical_features:
    fig, (box_ax, hist_ax) = plt.subplots(1, 2, figsize=(12, 6))

    sns.boxplot(x=df[feature], ax=box_ax)
    box_ax.set_title(f'{feature} with outliers')

    sns.histplot(data=df, x=feature, kde=True, ax=hist_ax)
    hist_ax.set_title(f'{feature} Distribution')

    plt.tight_layout()
    #fig.savefig(f'./plots/dist_with_outliers/{feature}_with_outliers.png')

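Substituirem per NaN els outliers de les variables numèriques predictores (no de la variable objectiu 'price') seguint el criteri del Rang Interquartil; aquests valors s'imputaran més endavant juntament amb la resta de missings.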

Per tant, considerarem outliers:
* Els valors més petits que Q1 - 1.5*IQR
* Els valors més grans que Q3 + 1.5*IQR


In [None]:
# Replace IQR outliers with NaN in every numerical feature, then re-plot.
# NOTE: the cell reads from and writes back into `df`, so running it twice
# recomputes the quartiles on already-trimmed data.
for feature in numerical_features:
    values = df[feature]

    # Tukey fences: 1.5 * IQR beyond the first and third quartiles.
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Series.mask upcasts integer columns to float when it inserts NaN.
    # Assigning NaN through .loc into an integer column instead triggers
    # pandas' "incompatible dtype" setitem deprecation.
    is_outlier = (values < lower_bound) | (values > upper_bound)
    df[feature] = values.mask(is_outlier)

    fig, axes = plt.subplots(1, 2, figsize=(12, 6))

    # Boxplot after outlier replacement
    sns.boxplot(x=df[feature], ax=axes[0])
    axes[0].set_title(f'{feature} Boxplot')

    # Distribution after outlier replacement
    sns.histplot(data=df, x=feature, kde=True, ax=axes[1])
    axes[1].set_title(f'{feature} Distribution')

    plt.tight_layout()
    #fig.savefig(f'./plots/dist_without_outliers/{feature}_without_outliers.png')

#### <span style="color:lightgreen"> Recodificació de variables categòriques </span>

In [None]:
from sklearn.preprocessing import LabelEncoder

def label_encode(df, categorical_columns):
    """Return a copy of ``df`` with the given columns label-encoded.

    Each listed column is cast to string and replaced by the integer codes
    produced by a fresh ``LabelEncoder``. The input frame is not modified.

    NOTE(review): because of the string cast, missing values may end up
    encoded as their own class (e.g. the text 'nan') - confirm this is wanted.
    """
    result = df.copy()

    for name in categorical_columns:
        as_text = result[name].astype(str)
        result[name] = LabelEncoder().fit_transform(as_text)

    return result

In [None]:
# Label-encode the categorical columns into a separate frame (`df` itself is left as is).
df_encoded = label_encode(df, categorical_features)

# Summary statistics of the encoded categorical columns.
df_encoded[categorical_features].describe()

#### <span style="color:lightgreen"> Normalització de variables numèriques </span>

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale the numerical features into a separate frame.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[numerical_features])
df_normalized = pd.DataFrame(scaled_values, columns=numerical_features)

# Histogram of each scaled feature.
for feature in numerical_features:
    fig, ax = plt.subplots(figsize=(10, 4))
    df_normalized[feature].plot.hist(ax=ax)
    ax.set_title(f'Distribució Normalitzada - {feature}')
    #plt.savefig(f'./plots/normalized/{feature}_normalized.png')

In [None]:
# Reproducible random sample of 10 rows of the scaled features.
df_normalized[numerical_features].sample(10, random_state=25) 

# 3. Remostreig

#### <span style="color:lightgreen"> Partició del dataset en Train i Test </span>

In [None]:
from sklearn.model_selection import train_test_split

# Predictors and target.
X = df.drop(columns='price')
y = df['price']

# Hold out 20% of the rows as the test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=33)

# Later cells assign imputed columns into X_train / X_test. Take explicit
# copies so those assignments act on independent frames and do not risk
# pandas' SettingWithCopyWarning on a subset of `X`.
X_train = X_train.copy()
X_test = X_test.copy()

In [None]:
# Table with the shape of the train and test sets, built in a single
# constructor call rather than by appending rows to an empty frame.
sets_df = pd.DataFrame(
    [
        ['Train', X_train.shape[0], X_train.shape[1]],
        ['Test', X_test.shape[0], X_test.shape[1]],
    ],
    columns=['Set', 'Number of Observations', 'Number of Features'],
)

sets_df

#### <span style="color:lightgreen"> Imputació de Missings</span>

##### Variables numèriques

In [None]:
# Statistics of the numerical training features before imputation.
train_numeric_before = X_train[numerical_features]

missing_before_num = train_numeric_before.isnull().sum()
mean_before = train_numeric_before.mean()
stderr_before = train_numeric_before.sem()
median_before = train_numeric_before.median()

In [None]:
from sklearn.impute import KNNImputer

# KNN imputer using the 5 nearest neighbours.
imputer = KNNImputer(n_neighbors=5)

# Learn the imputation from the numerical training features only.
imputer.fit(X_train[numerical_features])

# Fill the training set, then the test set, with the same fitted imputer.
X_train[numerical_features] = imputer.transform(X_train[numerical_features])
X_test[numerical_features] = imputer.transform(X_test[numerical_features])

In [None]:
# Statistics of the numerical training features after imputation.
train_numeric_after = X_train[numerical_features]

missing_after_num = train_numeric_after.isnull().sum()
mean_after = train_numeric_after.mean()
stderr_after = train_numeric_after.sem()
median_after = train_numeric_after.median()

In [None]:
# Side-by-side table of each statistic before ("old") and after imputation.
comparison_columns = {'Feature': missing_before_num.index}
for label, before, after in [
    ('Mean', mean_before, mean_after),
    ('Std_Error', stderr_before, stderr_after),
    ('Median', median_before, median_after),
]:
    comparison_columns[f'{label} (old)'] = before.values
    comparison_columns[label] = after.values

stats_comparison_num = pd.DataFrame(comparison_columns)
stats_comparison_num

##### Variables categòriques

In [None]:
# Keep a snapshot of the training features before the categorical imputation,
# used later for the before/after count plots.
original_data = X_train.copy()

In [None]:
# Names of the categorical features with at least one missing value in the training set.
has_missing = X_train[categorical_features].isnull().any()
cat_features_missings = has_missing[has_missing].index.tolist()
print(cat_features_missings)

In [None]:
from sklearn.impute import SimpleImputer

# Mode imputation for the categorical features.
# NOTE(review): this rebinds `imputer`, so the KNN imputer fitted earlier is
# no longer reachable under that name.
imputer = SimpleImputer(strategy='most_frequent')

# Fit on the training set only, then apply the same fitted imputer to the test set.
X_train[categorical_features] = imputer.fit_transform(X_train[categorical_features])
X_test[categorical_features] = imputer.transform(X_test[categorical_features])

In [None]:
def _count_panel(position, data, feature, title):
    """Draw one count plot in the given slot of a 1x2 subplot grid."""
    plt.subplot(1, 2, position)
    sns.countplot(data=data, x=feature, color='green')
    plt.title(title)
    plt.xticks(rotation=60)


# Class counts before (left) and after (right) mode imputation, for every
# categorical feature that had missing values.
for feature in cat_features_missings:
    plt.figure(figsize=(10, 4))

    # Left panel: original data, rows with NaN in this feature excluded.
    filtered_data = original_data[original_data[feature].notna()]
    _count_panel(1, filtered_data, feature, f'{feature} - Original')

    # Right panel: training data after imputation.
    _count_panel(2, X_train, feature, f'{feature} - Després de la Imputació')

    #plt.savefig(f'./plots/dist_moda/{feature}_moda.png')

#### <span style="color:lightgreen"> CV per avaluar el model </span>

In [None]:
# TODO: cross-validation for model evaluation is not implemented yet (the author was unsure what is required here).

# 4. Model Lineal Base

#### <span style="color:lightgreen"> Entrenament i avaluació d'un model de regressió lineal </span>

#### <span style="color:lightgreen"> Interpretació dels resultats obtinguts (mètriques de regressió, coeficients, etc.) </span>