In [1]:
from app.models import Session, Headline, Agency, Article
from app.utils import Country, Bias, Credibility
from app.analysis.clustering import prepare_cosine, form_clusters, label_clusters
import pandas as pd

In [2]:
# Query headline fields together with their article URL and agency attributes.
# `.all()` materialises the rows, so the DataFrame can be built after the
# session context has exited.
with Session() as s:
    data = (
        s.query(
            Headline.id,
            Headline.processed,
            Headline.first_accessed,
            Article.url,
            Agency.name,
            Agency._country,
            Agency._bias,
            Agency._credibility,
        )
        .join(Headline.article)
        .join(Article.agency)
        .all()
    )

# Column names follow the order of the queried fields above.
df = pd.DataFrame(
    data,
    columns=['id', 'headline', 'date', 'url', 'agency', 'country', 'bias', 'credibility'],
)

In [3]:
# Show 15 randomly chosen rows of the loaded frame.
df.sample(n=15)

In [4]:
from app.utils import Config
# Keep rows whose country is the US value, plus rows with the GB value whose
# agency is listed in Config.exempted_foreign_media.
df = df.loc[
    df['country'].eq(Country.us.value)
    | (
        df['country'].eq(Country.gb.value)
        & df['agency'].isin(Config.exempted_foreign_media)
    )
]

In [5]:
import pytz
# Derive a calendar-day column by calling .date() on each 'date' value.
# NOTE(review): no timezone handling happens here even though pytz is imported
# in this cell; the day is taken from the timestamp exactly as stored. If days
# are meant to be bucketed in US/Eastern, convert before calling .date().
# `assign` returns a new frame, which avoids writing a column into a frame that
# may itself be a filtered slice of an earlier one (SettingWithCopyWarning).
df = df.assign(day=df['date'].apply(lambda x: x.date()))
df.sample(5)

In [6]:
# Split the frame into a list of independent per-day frames, one for each
# distinct value of 'day' (in order of first appearance).
dfs = [
    df.loc[df['day'].eq(day)].copy()
    for day in df['day'].unique()
]
# Peek at the first per-day frame.
dfs[0].head()

In [7]:
# Parameters forwarded as the 2nd and 3rd positional arguments of form_clusters.
# NOTE(review): their exact meaning is defined in app.analysis.clustering.
n_samples = 10
threshold = 0.5

# Cluster each day's headlines separately and attach the labels to that day's
# frame. `threshold` is passed through instead of repeating the literal 0.5, so
# changing the setting above actually takes effect. The loop variable is named
# `day_df` so it does not read as the notebook-level `df`.
dfs = [
    label_clusters(
        day_df,
        form_clusters(prepare_cosine(day_df['headline']), n_samples, threshold),
    )
    for day_df in dfs
]

# Stitch the labelled per-day frames back into a single frame.
df = pd.concat(dfs)
df.sample(5)

In [8]:
# Drop rows whose cluster label is -1 (presumably "not assigned to a cluster";
# confirm against label_clusters). The explicit .copy() makes the result an
# independent frame so the column assignment below does not trigger
# SettingWithCopyWarning.
df = df[df['cluster'] != -1].copy()

# Cluster labels are produced per day, so combine day and cluster label into a
# single identifier that is unique across days.
df['clusterid'] = df['day'].astype(str) + df['cluster'].astype(str)

# Keep one row per (cluster, agency): an agency sometimes has more than one
# headline in the same cluster.
df = df.drop_duplicates(subset=['clusterid', 'agency'])

# Keep only clusters that still have at least 10 rows, i.e. at least 10
# distinct agencies after the de-duplication above.
df = df.groupby('clusterid').filter(lambda x: len(x) >= 10)

In [9]:
# Number of remaining rows for each day, as a Series indexed by 'day'.
per_day = df.groupby(by='day')
counts = per_day.size()

In [10]:
# groupby(...).size() yields an unnamed Series; give it the name 'count'.
counts.name = 'count'
counts.sample(n=10)

In [11]:
import matplotlib.pyplot as plt

# Line plot of the per-day counts on an explicitly created 10x5 inch figure.
fig, ax = plt.subplots(figsize=(10, 5))
counts.plot(ax=ax)


In [14]:
# Move the index into a regular column so `counts` becomes a DataFrame.
# NOTE: this rebinds `counts`, so re-running the cell adds another index column.
counts = counts.reset_index(drop=False)
counts.sample(n=10)

In [17]:
# Mark each day as "slow" when its count is below the median daily count.
# `counts` is a DataFrame here, so the median must be taken on the 'count'
# column: DataFrame.median() reduces every column and does not give the single
# scalar threshold needed for the comparison below.
median = counts['count'].median()
counts['slow'] = counts['count'] < median
counts.sample(10)

In [18]:
# Display the current value bound to `median`.
median

In [22]:
# Display the 'count' column of `counts`.
counts['count']

In [23]:
import pandas as pd

# Self-contained toy example of flagging values below the median.
# NOTE: this rebinds the notebook-level names `data`, `counts` and `median`,
# replacing whatever earlier cells stored in them.
data = {'count': list(range(1, 11))}
counts = pd.DataFrame(data)

# Median of the 'count' column (a single scalar).
median = counts['count'].median()
print("Median is:", median)

# Add a boolean 'slow' column: True where the count is strictly below the median.
counts = counts.assign(slow=counts['count'].lt(median))

# Print the rows in random order (sampling 10 of the 10 rows).
print(counts.sample(10))