In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTEENN
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced
from collections import Counter

In [2]:
online_news_raw_data = os.path.join(".", "Database Files", "OnlineNewsPopularity.csv")

In [3]:
online_news_df = pd.read_csv(online_news_raw_data)
online_news_df.head(10)

Unnamed: 0,url,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,...,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares
0,http://mashable.com/2013/01/07/amazon-instant-...,731.0,12.0,219.0,0.663594,1.0,0.815385,4.0,2.0,1.0,...,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875,593
1,http://mashable.com/2013/01/07/ap-samsung-spon...,731.0,9.0,255.0,0.604743,1.0,0.791946,3.0,1.0,1.0,...,0.033333,0.7,-0.11875,-0.125,-0.1,0.0,0.0,0.5,0.0,711
2,http://mashable.com/2013/01/07/apple-40-billio...,731.0,9.0,211.0,0.57513,1.0,0.663866,3.0,1.0,1.0,...,0.1,1.0,-0.466667,-0.8,-0.133333,0.0,0.0,0.5,0.0,1500
3,http://mashable.com/2013/01/07/astronaut-notre...,731.0,9.0,531.0,0.503788,1.0,0.665635,9.0,0.0,1.0,...,0.136364,0.8,-0.369697,-0.6,-0.166667,0.0,0.0,0.5,0.0,1200
4,http://mashable.com/2013/01/07/att-u-verse-apps/,731.0,13.0,1072.0,0.415646,1.0,0.54089,19.0,19.0,20.0,...,0.033333,1.0,-0.220192,-0.5,-0.05,0.454545,0.136364,0.045455,0.136364,505
5,http://mashable.com/2013/01/07/beewi-smart-toys/,731.0,10.0,370.0,0.559889,1.0,0.698198,2.0,2.0,0.0,...,0.136364,0.6,-0.195,-0.4,-0.1,0.642857,0.214286,0.142857,0.214286,855
6,http://mashable.com/2013/01/07/bodymedia-armba...,731.0,8.0,960.0,0.418163,1.0,0.549834,21.0,20.0,20.0,...,0.1,1.0,-0.224479,-0.5,-0.05,0.0,0.0,0.5,0.0,556
7,http://mashable.com/2013/01/07/canon-poweshot-n/,731.0,12.0,989.0,0.433574,1.0,0.572108,20.0,20.0,20.0,...,0.1,1.0,-0.242778,-0.5,-0.05,1.0,0.5,0.5,0.5,891
8,http://mashable.com/2013/01/07/car-of-the-futu...,731.0,11.0,97.0,0.670103,1.0,0.836735,2.0,0.0,0.0,...,0.4,0.8,-0.125,-0.125,-0.125,0.125,0.0,0.375,0.0,3600
9,http://mashable.com/2013/01/07/chuck-hagel-web...,731.0,10.0,231.0,0.636364,1.0,0.797101,4.0,1.0,1.0,...,0.1,0.5,-0.238095,-0.5,-0.1,0.0,0.0,0.5,0.0,710


In [4]:
# Clean data by only keeping columns we want to use
condensed_online_news_df = online_news_df.iloc[:,[3, 50, 53, 60]]
condensed_online_news_df.head(10)

Unnamed: 0,n_tokens_content,avg_positive_polarity,avg_negative_polarity,shares
0,219.0,0.378636,-0.35,593
1,255.0,0.286915,-0.11875,711
2,211.0,0.495833,-0.466667,1500
3,531.0,0.385965,-0.369697,1200
4,1072.0,0.411127,-0.220192,505
5,370.0,0.35061,-0.195,855
6,960.0,0.402039,-0.224479,556
7,989.0,0.42772,-0.242778,891
8,97.0,0.566667,-0.125,3600
9,231.0,0.298413,-0.238095,710


In [5]:
# Bucket 'shares' column into categories so we can measure popularity
bins = [0, 500, 1000, 100000, 1000000]
labels = ["Less Shareable", "Somewhat Shareable", "Highly Shareable", "Extremely Shareable"]
condensed_online_news_df["Popularity"] = pd.cut(condensed_online_news_df[" shares"], bins, labels=labels)
condensed_online_news_df.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,n_tokens_content,avg_positive_polarity,avg_negative_polarity,shares,Popularity
0,219.0,0.378636,-0.35,593,Somewhat Shareable
1,255.0,0.286915,-0.11875,711,Somewhat Shareable
2,211.0,0.495833,-0.466667,1500,Highly Shareable
3,531.0,0.385965,-0.369697,1200,Highly Shareable
4,1072.0,0.411127,-0.220192,505,Somewhat Shareable
5,370.0,0.35061,-0.195,855,Somewhat Shareable
6,960.0,0.402039,-0.224479,556,Somewhat Shareable
7,989.0,0.42772,-0.242778,891,Somewhat Shareable
8,97.0,0.566667,-0.125,3600,Highly Shareable
9,231.0,0.298413,-0.238095,710,Somewhat Shareable


## Split the Data into Training and Testing Sets

In [6]:
# Create our features
X = condensed_online_news_df.drop("Popularity", axis=1)
X = X.drop(" shares", axis=1)

X = pd.get_dummies(X)
# Create our target
y = condensed_online_news_df["Popularity"]
X

Unnamed: 0,n_tokens_content,avg_positive_polarity,avg_negative_polarity
0,219.0,0.378636,-0.350000
1,255.0,0.286915,-0.118750
2,211.0,0.495833,-0.466667
3,531.0,0.385965,-0.369697
4,1072.0,0.411127,-0.220192
...,...,...,...
39639,346.0,0.333791,-0.260000
39640,328.0,0.374825,-0.211111
39641,442.0,0.307273,-0.356439
39642,682.0,0.236851,-0.205246


In [7]:
X.describe()

Unnamed: 0,n_tokens_content,avg_positive_polarity,avg_negative_polarity
count,39644.0,39644.0,39644.0
mean,546.514731,0.353825,-0.259524
std,471.107508,0.104542,0.127726
min,0.0,0.0,-1.0
25%,246.0,0.306244,-0.328383
50%,409.0,0.358755,-0.253333
75%,716.0,0.411428,-0.186905
max,8474.0,1.0,0.0


In [8]:
# Check the balance of our target values
y.value_counts()

Highly Shareable       27162
Somewhat Shareable     11335
Less Shareable          1089
Extremely Shareable       58
Name: Popularity, dtype: int64

In [9]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X,
    y, random_state=1, stratify=y)

## Combination Sampling

In [10]:
# Resample the training data with SMOTEENN
smote_enn = SMOTEENN(random_state=1)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
Counter(y_resampled)

Counter({'Extremely Shareable': 14395,
         'Highly Shareable': 5519,
         'Less Shareable': 15285,
         'Somewhat Shareable': 8093})

In [11]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=1)

In [12]:
# Display the confusion matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[   5,    0,    9,    0],
       [1901,  156, 4734,    0],
       [  71,    0,  201,    0],
       [ 798,   45, 1991,    0]], dtype=int64)

In [13]:
# Calculate the balanced accuracy score
balanced_accuracy_score(y_test, y_pred)

0.2797712563526368

In [14]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                           pre       rec       spe        f1       geo       iba       sup

Extremely Shareable       0.00      0.36      0.72      0.00      0.51      0.25        14
   Highly Shareable       0.78      0.02      0.99      0.04      0.15      0.02      6791
     Less Shareable       0.03      0.74      0.30      0.06      0.47      0.23       272
 Somewhat Shareable       0.00      0.00      1.00      0.00      0.00      0.00      2834

        avg / total       0.53      0.04      0.97      0.03      0.12      0.02      9911



  _warn_prf(average, modifier, msg_start, len(result))
