In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from IPython.display import clear_output

In [None]:
# Show every column when a frame is displayed instead of truncating the middle.
pd.set_option("display.max_columns", None)

# Read the dataset and trim leading whitespace from each column name so the
# columns can be addressed by their plain names.
df = pd.read_csv("OnlineNewsPopularity.csv").rename(columns=str.lstrip)


In [None]:
# Weekday name -> 1-based position in the week (monday = 1 ... sunday = 7).
days = {
    name: position
    for position, name in enumerate(
        ['monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday'],
        start=1,
    )
}


In [None]:
# Display pandas' default summary statistics for the loaded frame.
df.describe()

In [None]:
# Line plot of the share count of every article, in row order.
df['shares'].plot(kind='line', figsize=(10, 10))
plt.show()  # render the figure; a bare plt.plot() would only add an empty line list
# As is clear from the plot, there are some spikes in the data.

In [None]:
def Remove_Outlier_Indices(news_popularity, column='shares', k=1.5):
    """Return a boolean mask selecting the rows that are NOT IQR outliers.

    Despite the name, nothing is removed here: the caller applies the returned
    mask to the frame to drop the outlying rows.

    Parameters
    ----------
    news_popularity : pandas.DataFrame
        Frame holding the column to inspect.
    column : str, default 'shares'
        Name of the column whose values are tested.
    k : float, default 1.5
        Width of the fences, as a multiple of the inter-quartile range.

    Returns
    -------
    pandas.Series of bool
        Indexed like ``news_popularity``; True where the value lies inside
        ``[Q1 - k * IQR, Q3 + k * IQR]`` (fence values themselves are kept).
    """
    values = news_popularity[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - k * iqr
    upper_fence = q3 + k * iqr
    # Negating the "is an outlier" test (rather than testing the inside of the
    # range directly) keeps rows whose value is missing, as before.
    return ~((values < lower_fence) | (values > upper_fence))

# Keep only the rows whose share count lies inside the IQR fences.  The
# explicit .copy() makes df an independent frame, so the column assignments
# made later in the notebook do not write into a slice of the unfiltered data.
index_news_outlier = Remove_Outlier_Indices(df)
df = df[index_news_outlier].copy()
temp_df = df.copy()  # independent copy of the outlier-filtered frame

In [None]:
# These histograms show, for each column, the ranges of values against the
# frequency of their occurrence.
df.hist(figsize=(30, 30))
plt.show()  # render the grid of histograms; a bare plt.plot() would draw nothing

In [None]:
# Box plot of the share counts that remain after the IQR filter.
share_counts = df['shares']
sns.boxplot(x=share_counts)
plt.show()
# There are still some outliers in the data, but since the dataframe is large
# enough they are not going to affect our results.

In [None]:
# Draw the distribution of `shares` three times: histogram only, density curve
# only, and finally with distplot's default settings.
for distplot_options in ({'kde': False}, {'hist': False}, {}):
    sns.distplot(df['shares'], **distplot_options)
    plt.show()
# distplot displays a histogram of the share values and its Gaussian kernel
# density estimate (KDE).
# A HISTOGRAM is the ranges of values against their frequency.
# The KDE is the function used to smooth the shape of the histogram into a curve.
# The KDE gives a smoother estimate of how the data is distributed.
# Most articles are shared 1000-2000 times; after that there is an exponential
# decrease.
# NOTE(review): distplot is deprecated in recent seaborn releases (histplot /
# kdeplot / displot replace it) - confirm the installed version still ships it.

In [None]:
# Container for the per-weekday totals; it is filled by a later cell.
day_wise_count = list()
# Indicator column name for every weekday, in the order of `days`.
day_of_week = [f"weekday_is_{day_name}" for day_name in days]

In [None]:
# Total of each weekday indicator column, in the order of day_of_week.  The
# list is rebuilt from scratch so that running this cell a second time cannot
# append a duplicate set of totals.
day_wise_count = [df[item].sum() for item in day_of_week]

In [None]:
# Show the per-weekday totals (same order as day_of_week).
print(day_wise_count)

In [None]:
# Bar chart of the per-weekday totals, with labelled axes so the figure can be
# read on its own.
fig, ax = plt.subplots(figsize=(25, 10))
ax.grid(axis='y')
ax.bar(day_of_week, day_wise_count)
ax.set_xlabel('weekday indicator column')
ax.set_ylabel('sum of the indicator column')
ax.set_title('Articles per day of the week')

plt.show()
# Weekdays are the most active days.

In [None]:
# Scatter of n_tokens_content against the number of shares.
plt.figure(figsize=(25, 25))
plt.scatter(df['n_tokens_content'], df['shares'])
plt.xlabel('n_tokens_content')
plt.ylabel('shares')
plt.show()
# Articles that are too long do not get many shares; articles with fewer than
# 2000 words in their content get the most shares.

In [None]:
# Scatter of num_imgs against the number of shares.
plt.figure(figsize=(25, 15))
plt.scatter(df['num_imgs'], df['shares'])
plt.xlabel('num_imgs')
plt.ylabel('shares')
plt.show()

In [None]:
# Scatter of avg_positive_polarity against the number of shares.
plt.figure(figsize=(25, 15))
plt.scatter(df['avg_positive_polarity'], df['shares'])
plt.xlabel('avg_positive_polarity')
plt.ylabel('shares')
plt.show()

In [None]:
# Integer code assigned to each data channel.
# NOTE(review): if the dataset also carries a `data_channel_is_world` column it
# is not listed here - confirm whether leaving it out is intentional.
data_channels = dict(entertainment=1, bus=2, socmed=3, tech=4, lifestyle=5)
# Indicator column name of every channel, in the order of the mapping above.
data_channel_is = [f"data_channel_is_{channel_name}" for channel_name in data_channels]

In [None]:
# Total of each data-channel indicator column, in the order of data_channel_is.
data_channels_count = [df[column_name].sum() for column_name in data_channel_is]

In [None]:
# Pie chart of the data-channel totals; the first and fourth wedges are pulled
# slightly out of the pie.
colors = ['darkblue', 'red', 'purple', 'grey', 'black']
explode = [0.1, 0, 0, 0.1, 0]
plt.pie(
    data_channels_count,
    labels=list(data_channels),
    colors=colors,
    explode=explode,
    autopct='%1.2f%%',
    startangle=90,
)
plt.show()

In [None]:
def func(x, day):
    """Encode one weekday indicator value as that weekday's integer code.

    Parameters
    ----------
    x : scalar
        A value taken from a ``weekday_is_<day>`` column.
    day : str
        Key of ``days`` naming the weekday the column stands for.

    Returns
    -------
    int
        ``days[day]`` when the flag is set, otherwise 0.  A value that already
        equals the code also counts as set, so encoding a column twice gives
        the same result as encoding it once.
    """
    code = days[day]
    if x in (1, code):
        return code
    return 0


# Replace the 0/1 flags of every weekday column with 0/<code of that weekday>.
# Because func accepts already-encoded values, re-running this cell no longer
# wipes the tuesday..sunday columns.
for day in days:
    column = f'weekday_is_{day}'
    df[column] = df[column].apply(func, args=(day,))


In [None]:
# Collapse the encoded weekday columns into a single `weekdays` column: each
# row's value is the sum of its weekday_is_* columns.
weekday_columns = [f'weekday_is_{day}' for day in days]
df['weekdays'] = df[weekday_columns].sum(axis=1)

# Drop the per-day columns from temp_df.  errors='ignore' lets this cell run
# again without failing on columns that have already been removed.
temp_df = temp_df.drop(columns=weekday_columns, errors='ignore')

In [None]:
import numpy as np
from matplotlib import cm

# Pie chart of the per-weekday totals, with one Set1 colour sampled per day.
day_total = len(days)
colors = cm.Set1(np.arange(day_total) / day_total)
plt.pie(day_wise_count, labels=list(days), colors=colors, autopct='%1.2f%%')
plt.show()

In [None]:
def func(x, channel):
    """Encode one data-channel indicator value as that channel's integer code.

    Parameters
    ----------
    x : scalar
        A value taken from a ``data_channel_is_<channel>`` column.
    channel : str
        Key of ``data_channels`` naming the channel the column stands for.

    Returns
    -------
    int
        ``data_channels[channel]`` when the flag is set, otherwise 0.  A value
        that already equals the code also counts as set, so encoding a column
        twice gives the same result as encoding it once.
    """
    code = data_channels[channel]
    if x in (1, code):
        return code
    return 0


# Replace the 0/1 flags of every channel column with 0/<code of that channel>.
# Because func accepts already-encoded values, re-running this cell no longer
# wipes the columns whose code is not 1.
for data_channel in data_channels:
    column = f'data_channel_is_{data_channel}'
    df[column] = df[column].apply(func, args=(data_channel,))

In [None]:
# Collapse the encoded channel columns into a single `data_channel` column:
# each row's value is the sum of its data_channel_is_* columns.
channel_columns = [f'data_channel_is_{channel}' for channel in data_channels]
df['data_channel'] = df[channel_columns].sum(axis=1)

# Drop the per-channel columns from temp_df.  errors='ignore' lets this cell
# run again without failing on columns that have already been removed.
temp_df = temp_df.drop(columns=channel_columns, errors='ignore')

In [None]:
temp_df['weekdays'] = df['weekdays']
temp_df['data_channel'] = df['data_channel']

# temp_df is just the original dataframe in a compressed format (one column
# for the weekday, one for the data channel), so that machine learning models
# can work on it easily.

# Columns that must not become model features.  `url` holds text and cannot be
# fed to a float32 numeric feature column.  The notebook pops these columns
# from temp_df only in later cells, which is too late to affect the split, so
# they are excluded here.
# NOTE(review): the LDA_* columns are excluded because a later cell pops them;
# confirm they are really meant to be left out of the model.
non_feature_columns = ['url'] + [f'LDA_{element}' for element in ('00', '01', '02', '03', '04')]
model_df = temp_df.drop(columns=non_feature_columns, errors='ignore')

# The first 30000 rows train the model and the rest evaluate it.  .copy() gives
# each split its own data, so pop() below does not mutate a slice of model_df.
dftrain = model_df.iloc[:30000, :].copy()
dfeval = model_df.iloc[30000:, :].copy()
y_train = dftrain.pop('shares')
y_eval = dfeval.pop('shares')

In [None]:
# Remove the url column from temp_df; pop() returns the removed column, which
# is what this cell displays.
# NOTE(review): dftrain/dfeval were built from temp_df in an earlier cell, so
# this removal does not change them.
temp_df.pop('url')

In [None]:
# Remove the columns LDA_00 ... LDA_04 from temp_df, one at a time.
LDA = [f'{number:02d}' for number in range(5)]
for suffix in LDA:
    del temp_df[f'LDA_{suffix}']

In [None]:
import tensorflow.compat.v2.feature_column as fc
import tensorflow as tf

In [None]:
# Declare every column of the training frame as a float32 numeric feature.
numericColumn = dftrain.columns.tolist()
feature_columns = [
  tf.feature_column.numeric_column(feature_name, dtype=tf.float32)
  for feature_name in numericColumn
]


In [None]:
def make_input_fn(data_df, label_df, num_epochs=10, shuffle=True, batch_size=32):
  """Build a zero-argument function that creates the input dataset.

  Args:
    data_df: mapping-like features (converted with ``dict(data_df)``).
    label_df: labels, sliced in step with the features.
    num_epochs: how many times the batched data is repeated.
    shuffle: when true, shuffle with a buffer of 1000 elements before batching.
    batch_size: number of elements per batch.

  Returns:
    A function taking no arguments that returns the dataset, built in the order
    slice -> (shuffle) -> batch -> repeat.
  """
  def input_function():
    features = dict(data_df)
    dataset = tf.data.Dataset.from_tensor_slices((features, label_df))
    dataset = dataset.shuffle(1000) if shuffle else dataset
    return dataset.batch(batch_size).repeat(num_epochs)
  return input_function

# Training input: make_input_fn defaults (10 epochs, shuffled, batches of 32).
train_input_fn = make_input_fn(data_df=dftrain, label_df=y_train)
# Evaluation input: a single, unshuffled pass over the evaluation rows.
eval_input_fn = make_input_fn(data_df=dfeval, label_df=y_eval, shuffle=False, num_epochs=1)

In [None]:
# Pull a single batch of 10 examples to sanity-check the input pipeline.
ds = make_input_fn(dftrain, y_train, batch_size=10)()
for feature_batch, label_batch in ds.take(1):
  print('Some feature keys:', list(feature_batch.keys()))
  print()
  # The values shown are the n_tokens_title feature, so label them as such.
  print('A batch of n_tokens_title:', feature_batch['n_tokens_title'].numpy())
  print()
  print('A batch of Labels:', label_batch.numpy())

In [None]:
# `shares` is a count, not a class id, so fit a linear regressor.  A
# LinearClassifier left at its default of two classes only accepts labels 0/1
# and rejects the share counts used as labels here.
linear_est = tf.estimator.LinearRegressor(feature_columns=feature_columns)
linear_est.train(train_input_fn)
result = linear_est.evaluate(eval_input_fn)

clear_output()  # drop the training log so that only the metrics stay visible
print(result)