In [None]:
import numpy as np
import pandas as pd
from scipy.special import softmax
from scipy.stats import f_oneway
from bokeh.plotting import figure, show
from bokeh.io import output_notebook, curdoc
from bokeh.palettes import Category10
from datetime import datetime
# Notebook-wide Bokeh setup: apply the "caliber" theme and render plots inline.
curdoc().theme = "caliber"
output_notebook()

# Load the raw tables. `thousands=','` lets pandas parse comma-grouped numbers
# as numeric values; `parse_dates=[0]` parses the first column of the
# daily-users file as dates.
packages = pd.read_csv("../input/citadel-datathon-west-coast-regional-fall-2021/packages.csv")
daily_pageviews = pd.read_csv("../input/citadel-datathon-west-coast-regional-fall-2021/analytics_daily_pageviews.csv", thousands=',')
daily_users = pd.read_csv("../input/citadel-datathon-west-coast-regional-fall-2021/analytics_daily_users.csv", thousands=',', parse_dates=[0])
country_users = pd.read_csv("../input/citadel-datathon-west-coast-regional-fall-2021/analytics_country_data.csv", thousands=',')

# data cleaning
# packages = packages.sort_values("created_at").set_index("created_at")
# packages = packages.sort_values("created_at")
daily_users.session_duration = pd.to_timedelta(daily_users.session_duration)
# Strip the trailing '%' and rescale to a fraction. The builtin `float` is
# used because the `np.float` alias was removed in NumPy 1.24.
daily_users.bounce_rate = daily_users.bounce_rate.str.rstrip('%').astype(float) / 100.0
daily_users = daily_users.set_index("date")

# Daily Users

In [None]:
# Display the daily_users DataFrame as this cell's output.
daily_users

In [None]:
# One line per count-style column of daily_users, plotted against the date index.
cols = ["users", "new_users", "pageviews"]
palette = Category10[len(cols)]

fig = figure(
    title="Daily User Analytics",
    x_axis_label='date',
    x_axis_type='datetime',
    y_axis_label='number',
    sizing_mode="stretch_width",
    height=400,
)
for col, color in zip(cols, palette):
    series = daily_users[col]
    fig.line(daily_users.index, series, legend_label=col, line_color=color)

show(fig)

In [None]:
# One line per per-session / rate column of daily_users, plotted against the date index.
cols = ["sessions_per_user", "pages_per_session", "bounce_rate"]
palette = Category10[len(cols)]

fig = figure(
    title="Daily User Analytics",
    x_axis_label='date',
    x_axis_type='datetime',
    y_axis_label='number',
    sizing_mode="stretch_width",
    height=400,
)
for col, color in zip(cols, palette):
    series = daily_users[col]
    fig.line(daily_users.index, series, legend_label=col, line_color=color)
show(fig)

# Packages

In [None]:
# Columns kept from the filtered headline-test file.
cols = ["created_at", "test_id", "headline", "image_id", "pred", "click_rate", "first_place", "winner"]

# Read, keep only `cols`, let pandas infer nullable dtypes, then order
# chronologically with a fresh 0..n-1 index.
data = (
    pd.read_csv(
        "../input/citadel-datathon-west-coast-regional-fall-2021/clean_data_filtered.csv",
        parse_dates=["created_at"],
    )
    .loc[:, cols]
    .convert_dtypes()
    .sort_values("created_at")
    .reset_index(drop=True)
)
# data["click_rate"] = data.clicks / data.impressions

In [None]:
# Columns to hold as plain floats; `pred` is cast to a plain int.
float_cols = ['click_rate', 'performance', 'vadneg', 'vadneu', 'vadpos', 'wneg', 'wpos', 'wneu', 'sneg', 'spos', 'sneu',
    'posemo', 'negemo', 'anx', 'anger', 'sad', 'HarmVirtue', 'HarmVice', 'FairnessVirtue', 'FairnessVice', 'IngroupVirtue', 'IngroupVice', 'AuthorityVirtue', 'AuthorityVice', 'PurityVirtue', 'PurityVice']
type_map = {col: float for col in float_cols}
type_map["pred"] = int

# DataFrame.astype raises KeyError when the mapping names a column that does
# not exist. `data` was loaded with only a subset of columns, so restrict the
# mapping to the columns actually present before casting.
type_map = {col: dtype for col, dtype in type_map.items() if col in data.columns}
data = data.astype(type_map)
data.dtypes

In [None]:
# pd.DataFrame(data.pred).corrwith(data)
# Pairwise correlations, ordered by correlation with `pred`.
# `numeric_only=True` is explicit because `data` also holds text and datetime
# columns, which pandas >= 2.0 no longer drops silently inside corr().
data.corr(numeric_only=True).sort_values("pred")

In [None]:
# def performance(x):
#     expsum = np.exp(x).sum()
# #     print(expsum)
#     outs = np.array([np.exp(v) / expsum for v in x])
#     outs -= outs.mean()
#     return outs

In [None]:
# data["performance"] = data.groupby("test_id").click_rate.transform(performance)

In [None]:
# data = data[data.groupby("test_id").image_id.transform(lambda x: x.nunique() == 1)]

In [None]:
# Display the current dtype of every column in `data`.
data.dtypes

In [None]:
# fig = figure(title="Daily User Analytics", x_axis_label='date', x_axis_type='datetime', y_axis_label='number', sizing_mode="stretch_width", height=400)
# fig.circle(data.created_at, data.performance, legend_label="performance")
# show(fig)

In [None]:
# Mean of `pred` per distinct `created_at` value, drawn as a single line.
x = data.groupby("created_at")["pred"].mean()

fig = figure(
    title="1",
    x_axis_label='date',
    x_axis_type='datetime',
    y_axis_label='number',
    sizing_mode="stretch_width",
    height=400,
)
fig.line(x.index, x.tolist(), legend_label="mean(pred)")
show(fig)

In [None]:
# Partition `data` two ways: by date (before / on-or-after cut_date) and by
# whether `first_place` equals `winner` ("no override") or differs ("override").
cut_date = datetime(2015, 2, 13)

pre_data = data[data.created_at < cut_date]
post_data = data[data.created_at >= cut_date]

pre_no_override = pre_data[pre_data.first_place == pre_data.winner]
pre_override = pre_data[pre_data.first_place != pre_data.winner]

post_no_override = post_data[post_data.first_place == post_data.winner]
post_override = post_data[post_data.first_place != post_data.winner]

# Shapes of all six frames on one line, in the order they were built.
parts = (pre_data, post_data, pre_no_override, pre_override, post_no_override, post_override)
print(*(part.shape for part in parts))

In [None]:
# Each pair is (baseline, comparison). For every pair run a one-way ANOVA on
# `pred` and report the difference in means (comparison minus baseline).
comparisons = [
    (pre_data.pred, post_data.pred),
    (pre_override.pred, post_override.pred),
    (pre_no_override.pred, post_no_override.pred),
    (pre_override.pred, pre_no_override.pred),
    (post_override.pred, post_no_override.pred),
]
X = [baseline for baseline, _ in comparisons]
Y = [comparison for _, comparison in comparisons]

for x, y in comparisons:
    res = f_oneway(x, y)
    effect = y.mean() - x.mean()
    print(res, "effect =", effect)

In [None]:
# NOTE(review): this cell repeats the ANOVA comparisons of the preceding cell
# verbatim; confirm whether a different column was meant to be tested here.
baseline_frames = (pre_data, pre_override, pre_no_override, pre_override, post_override)
comparison_frames = (post_data, post_override, post_no_override, pre_no_override, post_no_override)

X = [frame.pred for frame in baseline_frames]
Y = [frame.pred for frame in comparison_frames]

for x, y in zip(X, Y):
    # Difference in mean `pred`: second group minus first group.
    effect = y.mean() - x.mean()
    res = f_oneway(x, y)
    print(res, "effect =", effect)

In [None]:
# data.to_csv("dataset.csv")

# Sanity Check

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
# Load the fake-news feature table and drop every row whose label `y` is 2.
# NOTE: this rebinds `data`, replacing the headline-test frame used above.
data = (
    pd.read_csv("../input/citadel-datathon-west-coast-regional-fall-2021/fake_news.csv")
    .loc[lambda frame: frame.y != 2]
)
data.dtypes

In [None]:
# Lift the column display cap so describe() shows every feature.
pd.options.display.max_columns = None
data.describe()

In [None]:
# Features are every column except the label `y`.
X, y = data.drop("y", axis="columns"), data.y
# NOTE(review): the scaler is fitted on all rows before the split, so held-out
# rows influence the scaling seen in training. Kept as-is because later cells
# reuse this fully scaled `X`.
X = StandardScaler().fit_transform(X)
# Fixed seed so the split (and every score derived from it) is reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# NOTE(review): C=0.005995 is hard-coded; presumably it came from an earlier
# hyper-parameter search — confirm and record its origin.
model = LinearSVC(C=0.005995)
model.fit(X_train, y_train)
# Predict once and reuse the result (the original ran predict twice and never
# used `pred`). The `> 0.5` threshold is kept from the original.
pred = (model.predict(X_test) > 0.5)
# Number of held-out rows where the thresholded prediction equals the label.
(pred == y_test).sum()

In [None]:
# Score the fitted model on the held-out split.
model.score(X_test, y_test)

In [None]:
# Refit the same configuration on every row, then score on those same rows
# (an in-sample score, not a held-out estimate).
model = LinearSVC(C=0.005995).fit(X, y)
model.score(X, y)

In [None]:
# Display the fitted coefficient array of the model.
model.coef_

In [None]:
# Pair each feature name (every column except `y`) with its coefficient from
# the first row of model.coef_; columns keep the default integer labels 0 and 1.
feature_names = data.columns.drop("y")
df = pd.DataFrame(list(zip(feature_names, model.coef_[0])))

In [None]:
# Display df ordered by its column labelled 1, ascending.
df.sort_values(1)

In [None]:
# Count how many rows carry each value of the label column `y`.
data.y.value_counts()

In [None]:
# predict() requires the feature matrix; calling it with no argument raises
# TypeError. Use `X`, the matrix this model was last fitted on.
# NOTE(review): assumes the cells above ran in order so `X` is still that matrix.
model.predict(X)

# Sanity Check for features

In [None]:
from glob import glob
from nltk import tokenize, word_tokenize

In [None]:
# Read the lexicon file, one entry per line. It is stored as a set because it
# is only ever probed with `in`, once per token, and a list makes each of those
# lookups a linear scan. The same split("\n") is kept, so membership results
# are unchanged.
with open("lexicon.lex", 'r') as f:
    lexicon = set(f.read().split("\n"))


In [None]:
def func(words):
    """Return the fraction of tokens in ``words`` that are present in ``lexicon``.

    Parameters
    ----------
    words : sequence of str
        Tokens to check; must support ``len()`` and iteration.

    Returns
    -------
    float
        Matching tokens divided by total tokens, or ``0.0`` when ``words`` is
        empty (the original raised ZeroDivisionError in that case).
    """
    if len(words) == 0:
        return 0.0
    return sum(int(w in lexicon) for w in words) / len(words)

# Score every article once and collect (lexicon_fraction, label) pairs in `l`.
# Files under real/ get label 0, files under fake/ get label 1.
# One loop replaces the two copy-pasted ones; func() and glob() each run once
# instead of twice. Printed output is unchanged.
l = []
for name, label in (("real", 0), ("fake", 1)):
    paths = glob(f"{name}/*.txt")
    cnt = 0   # running sum of per-file lexicon fractions
    slen = 0  # running total of tokens across files
    for path in paths:
        with open(path, "r", encoding='utf-8', errors='ignore') as f:
            words = word_tokenize(f.read().lower())
        score = func(words)
        l.append((score, label))
        cnt += score
        slen += len(words)

    # Sum of fractions, then mean tokens per file for this corpus.
    print(f"{name}:", cnt, slen / len(paths))


In [None]:
# Build a frame from the tuples collected in `l` (default column labels 0 and 1)
# and display it.
df = pd.DataFrame(l)
df

In [None]:
from scipy.stats import pearsonr
# Pearson correlation between column 0 and column 1 of df.
pearsonr(df[0], df[1])

In [None]:
# List the contents of the real/ and fake/ directories via the shell.
!ls real/
!ls fake/

In [None]:
# Spot check: tokenize one lower-cased headline and count its lexicon hits.
headline = "This simple calculator tells you how each presidential candidate's tax plan affects you"
s = word_tokenize(headline.lower())
sum(1 for w in s if w in lexicon)

In [None]:
# Spot check of func on a hand-written token list.
func(['more', 'than', '160', 'republicans', 'do', "n't", 'support', 'donald', 'trump'])

# Graph

In [None]:
from scipy.signal import savgol_filter

In [None]:
# Reload the headline-test data for the final figure, ordered chronologically.
cols = ["created_at", "test_id", "headline", "image_id", "pred", "click_rate", "first_place", "winner"]
data = pd.read_csv("../input/citadel-datathon-west-coast-regional-fall-2021/clean_data_filtered.csv", parse_dates=["created_at"])[cols].convert_dtypes()
data = data.sort_values("created_at").reset_index(drop=True)
# "override" = rows where `first_place` differs from `winner`.
override, no_override = data[data.first_place != data.winner], data[data.first_place == data.winner]

# Reload and clean the daily-users table the same way as at the top of the notebook.
daily_users = pd.read_csv("../input/citadel-datathon-west-coast-regional-fall-2021/analytics_daily_users.csv", thousands=',', parse_dates=[0])
daily_users.session_duration = pd.to_timedelta(daily_users.session_duration)
# Strip the trailing '%' and rescale to a fraction. The builtin `float` is
# used because the `np.float` alias was removed in NumPy 1.24.
daily_users.bounce_rate = daily_users.bounce_rate.str.rstrip('%').astype(float) / 100.0
daily_users = daily_users.set_index("date")
daily_users.dtypes

In [None]:
fig = figure(title="", x_axis_label='', x_axis_type='datetime', y_axis_label='% of Clickbaits', sizing_mode="stretch_width", height=400)

# Mean `pred` per `created_at` value for each group, as floats.
y1, y2 = override.groupby("created_at").pred.mean().astype(float), no_override.groupby("created_at").pred.mean().astype(float)
# Align the override series to the no-override index; timestamps with no
# override rows are filled with 0.
y1 = y1.reindex(y2.index, fill_value=0.)
# Centered 50-point rolling means (the first and last points become NaN).
y1_smooth, y2_smooth = y1.rolling(50, center=True).mean(), y2.rolling(50, center=True).mean()
# Only the `users` column is plotted. Rolling over the whole frame would also
# hit the timedelta `session_duration` column, which pandas 2.x refuses to
# aggregate instead of silently dropping. Selecting the column keeps `users`
# a DataFrame, so `users.users` below still works.
users = daily_users[["users"]].rolling(1).mean()
print(y1_smooth.shape, y2_smooth.shape)
fig.line(y1_smooth.index, y1_smooth, legend_label="override", color="red", line_width=2)
fig.line(y2_smooth.index, y2_smooth, legend_label="no override", color="blue", line_width=2)
# Shade the band between the two smoothed curves.
fig.varea(y1_smooth.index, y1_smooth, y2_smooth, color="blanchedalmond")
# User counts are divided by 1e7 to share the y-axis with the two rate curves,
# then smoothed with a Savitzky-Golay filter (window 41, polynomial order 3).
fig.line(users.index, savgol_filter(users.users / 1e7, 41, 3), legend_label="active users (1e7)", color="green", line_width=1)
# fig.line(y1_smooth.index, y1_smooth - y2_smooth)
# fig.legend.label_text_font = ""
fig.legend.label_text_font_size = "15px"
show(fig)

In [None]:
from bokeh.plotting import output_file, save
# Point Bokeh's output at a standalone HTML file, then write the current `fig` to it.
output_file(filename="custom_filename.html", title="Static HTML file")
save(fig)

In [None]:
# Element-wise gap between the smoothed override and no-override series.
y1_smooth - y2_smooth