In [1]:
# All imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Ridge, Lasso
from modules import *

In [4]:
def main(filepath='./Data/OnlineNewsPopularity.csv'):
    """Run the pipeline end to end: load, cap the target, drop features, train.

    Steps, in order:
      1. Load the dataset with ``load_data``.
      2. Cap the ``shares`` column at its 95th percentile and print the
         summary statistics from before and after the cap.
      3. Remove a fixed list of columns with ``drop_features``.
      4. Pass the resulting frame to ``create_model``.

    ``load_data``, ``drop_features`` and ``create_model`` are not defined in
    this notebook; presumably they come from ``from modules import *`` --
    TODO confirm.

    Parameters
    ----------
    filepath : str, optional
        Path handed to ``load_data``. Defaults to the CSV location that was
        previously hard-coded, so ``main()`` behaves as before.

    Returns
    -------
    None
        Progress and results are reported through ``print``.
    """
    print("*************** Loading the dataset ***************")
    df = load_data(filepath)

    print("*************** Preprocessing the data ***************")
    # Snapshot the statistics BEFORE capping so the two can be compared.
    original_stats = df['shares'].describe()

    # Cap the 'shares' column at its 95th percentile; values above the
    # threshold are replaced by the threshold, values below are untouched.
    percentile_95 = np.percentile(df['shares'], 95)
    df['shares'] = df['shares'].clip(upper=percentile_95)
    capped_stats = df['shares'].describe()

    print("Capping the value of shares to 95th percentile\n Comparison of original stats for shares to capped value stats\n")
    # A bare `original_stats, capped_stats` expression inside a function is
    # evaluated and thrown away, so the comparison has to be printed explicitly.
    stats_comparison = pd.concat(
        [original_stats, capped_stats],
        axis=1,
        keys=['original', 'capped'],
    )
    print(stats_comparison)

    print("*************** Performing Feature Selection ***************")

    # Columns to remove before modelling (each listed once).
    # NOTE(review): the reason these particular columns are dropped is not
    # visible in this notebook -- confirm against the exploratory analysis.
    features_to_drop = [
        'n_non_stop_words',
        'n_non_stop_unique_tokens',
        'self_reference_max_shares',
        'self_reference_min_shares',
        'kw_max_min',
        'kw_min_min',
        'kw_min_max',
        'kw_max_avg',
        'kw_min_avg',
        'min_negative_polarity',
        'max_positive_polarity',
        'rate_negative_words',
        'rate_positive_words',
        'title_subjectivity',
        'weekday_is_sunday',
        'weekday_is_saturday',
        'global_rate_negative_words',
        'global_rate_positive_words',
        'n_unique_tokens',
        'average_token_length'
    ]

    df = drop_features(df, features_to_drop)
    print(f"Final selected features are : \n {df.columns}")

    print("\n\n\n*************** Model Training and Evaluation ***************")

    # NOTE(review): the two values returned here are not used afterwards;
    # consider returning them from main() if callers need the fitted model.
    final_model, model_metrics = create_model(df)

    print("\n*************** Processing Complete ***************\n")


In [3]:
# Entry point: run the pipeline only when this code is executed directly,
# not when it is imported from another module.
if __name__ == "__main__":
    main()

*************** Loading the dataset ***************

Shape of the dataset: (39644, 61)

Data Types and Non-Null Counts:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39644 entries, 0 to 39643
Data columns (total 61 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   url                            39644 non-null  object 
 1   timedelta                      39644 non-null  float64
 2   n_tokens_title                 39644 non-null  float64
 3   n_tokens_content               39644 non-null  float64
 4   n_unique_tokens                39644 non-null  float64
 5   n_non_stop_words               39644 non-null  float64
 6   n_non_stop_unique_tokens       39644 non-null  float64
 7   num_hrefs                      39644 non-null  float64
 8   num_self_hrefs                 39644 non-null  float64
 9   num_imgs                       39644 non-null  float64
 10  num_videos                     39644 non-null 