In [1]:
import pandas as pd

from sentence_transformers import SentenceTransformer

In [2]:
# Download News_Final.csv into the data folder before running this notebook:
# https://archive.ics.uci.edu/ml/machine-learning-databases/00432/Data/

# Folder prefix; read_data() concatenates it directly with FILENAME, so keep the trailing slash.
DATA_PATH = "data/"
FILENAME = "News_Final.csv"
# Name of the column used as the dependent variable; read_data() drops rows where it equals -1.
MEDIUM = "Facebook"

MAX_SHARES = 500 # upper limit for the dependent variable; preprocess() drops rows above it

In [3]:
def read_data(path=None, medium=None, n_sources=10):
    """Load the news CSV and keep only the rows and columns used for modelling.

    Parameters
    ----------
    path : str, optional
        CSV file to read. Defaults to ``DATA_PATH + FILENAME`` from the
        notebook configuration.
    medium : str, optional
        Name of the column holding the dependent variable. Defaults to the
        notebook-level ``MEDIUM``.
    n_sources : int, default 10
        Number of most frequent values of ``Source`` to keep.

    Returns
    -------
    pandas.DataFrame
        An independent copy with the columns ``Topic``, ``Source``,
        ``SentimentHeadline``, ``SentimentTitle``, ``Headline`` and ``medium``,
        restricted to rows whose ``medium`` value is not -1 and whose
        ``Source`` is among the ``n_sources`` most frequent ones. The original
        row index is preserved.
    """
    # Resolve defaults at call time so later changes to the notebook
    # configuration are honoured without redefining the function.
    if path is None:
        path = DATA_PATH + FILENAME
    if medium is None:
        medium = MEDIUM

    df_original = pd.read_csv(path)
    print("Original dataset size:", df_original.shape)

    # Source frequencies are computed on the full file, i.e. before rows with
    # medium == -1 are removed.
    frequent_sources = df_original['Source'].value_counts().index[:n_sources]
    cols = ['Topic', 'Source', 'SentimentHeadline', 'SentimentTitle', 'Headline', medium]

    # -1 is presumably a "no data" sentinel for the share count (TODO confirm
    # against the dataset description); such rows are dropped.
    keep = (df_original[medium] != -1) & df_original['Source'].isin(frequent_sources)

    # Single .loc selection + copy: the result is not a view of df_original,
    # so downstream assignments cannot trigger SettingWithCopyWarning.
    df = df_original.loc[keep, cols].copy()
    print("Processed dataset size:", df.shape)
    return df


# Load and filter the raw dataset, then preview the first rows.
df = read_data()
df.head()

Original dataset size: (93239, 11)
Processed dataset size: (8820, 6)


Unnamed: 0,Topic,Source,SentimentHeadline,SentimentTitle,Headline,Facebook
736,economy,Business Insider,-0.378927,-0.055902,The US economy had a blockbuster October. US c...,27
741,economy,Bloomberg,0.0,-0.079057,Zimbabwe freed its economy from the nightmare ...,61
748,economy,Bloomberg,-0.02983,-0.236111,Vietnam's export-driven economy is set to grow...,31
751,economy,The Guardian,-0.10229,-0.208333,The likelihood of employers offering low wage ...,20
752,economy,Reuters,0.005231,0.047246,(Repeats Sunday story with no changes to text)...,5


In [4]:
def preprocess(df, medium=None, max_shares=None, random_state=1):
    """Cap the target, shuffle the rows and one-hot encode the categorical columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``Topic``, ``Source``, ``Headline``,
        ``SentimentHeadline``, ``SentimentTitle`` and ``medium``.
    medium : str, optional
        Name of the dependent-variable column. Defaults to the notebook-level
        ``MEDIUM``.
    max_shares : number, optional
        Rows whose ``medium`` value exceeds this limit are dropped. Defaults
        to the notebook-level ``MAX_SHARES``.
    random_state : int, default 1
        Seed for the row shuffle, so the resulting order is reproducible.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index whose columns are, in order:
        the ``Topic`` dummies, the ``Source`` dummies, ``Headline``,
        ``SentimentHeadline``, ``SentimentTitle`` and ``medium``. The input
        frame is not modified.
    """
    # Resolve defaults at call time so the notebook configuration is honoured.
    if medium is None:
        medium = MEDIUM
    if max_shares is None:
        max_shares = MAX_SHARES

    df = df[df[medium] <= max_shares]

    # Sampling every row is a seeded shuffle; the index is reset so the frame
    # can later be concatenated column-wise with freshly built frames.
    df_sample = df.sample(n=df.shape[0], random_state=random_state).reset_index(drop=True)

    parts = [
        pd.get_dummies(df_sample['Topic']),
        pd.get_dummies(df_sample['Source']),
        df_sample[['Headline', 'SentimentHeadline', 'SentimentTitle', medium]],
    ]
    df_sample = pd.concat(parts, axis=1)
    return df_sample

# Build the shuffled, one-hot encoded frame and preview the first rows.
df_sample = preprocess(df)
df_sample.head()

Unnamed: 0,economy,microsoft,obama,palestine,ABC News,Bloomberg,Business Insider,CNN,Economic Times,Forbes,New York Times,Reuters,The Guardian,Washington Post,Headline,SentimentHeadline,SentimentTitle,Facebook
0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,Singapore’s economy expanded more than economi...,-0.231503,0.075,86
1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,China's economy has seen positive changes sinc...,-0.054301,0.170139,6
2,1,0,0,0,0,1,0,0,0,0,0,0,0,0,"Austan Goolsbee, a University of Chicago Booth...",0.094341,0.0,2
3,0,0,1,0,0,0,0,0,0,0,0,0,0,1,What are you looking for? The extent to which ...,-0.079167,-0.040447,171
4,1,0,0,0,0,0,0,0,0,0,0,0,1,0,A woman walks past at an electronic board in T...,-0.02665,0.050397,7


# Headline embeddings

In [5]:
def add_embeddings(df, model=None, model_name='bert-base-uncased'):
    """Replace the ``Headline`` text column with sentence-embedding columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string column ``Headline``.
    model : object, optional
        Pre-loaded encoder exposing ``encode(list_of_str)`` and returning a
        2-D array with one row per sentence. When omitted, a
        ``SentenceTransformer(model_name)`` is created for this call.
    model_name : str, default 'bert-base-uncased'
        Model identifier used only when ``model`` is not supplied.

    Returns
    -------
    pandas.DataFrame
        All columns of ``df`` except ``Headline``, followed by one column per
        embedding dimension (integer column labels 0..d-1). The rows keep the
        index of ``df``; ``df`` itself is not modified.
    """
    if model is None:
        model = SentenceTransformer(model_name)

    # Read the headlines from the frame that was passed in. Reading them from
    # a notebook global would silently embed a different frame's text.
    sentences = [s.lower().replace('"', '') for s in df['Headline']]
    embeddings = model.encode(sentences)
    print("Embeddings size:", embeddings.shape)

    # concat(axis=1) aligns rows by index label, so the embedding frame must
    # carry df's index; otherwise a non-default index yields NaN-padded rows.
    df_vectors = pd.DataFrame(embeddings, index=df.index)
    df_embeddings = pd.concat([df.drop(columns=['Headline']), df_vectors], axis=1)

    return df_embeddings

# Embed the headlines of the preprocessed frame and preview the result.
df_embeddings = add_embeddings(df_sample)
df_embeddings.head()


# Save (uncomment to persist the features; DATA_PATH already ends with "/", so no extra separator is needed)
# df_embeddings.to_csv(f"{DATA_PATH}news_final_{MEDIUM}.csv", index=False)

Embeddings size: (7948, 768)


Unnamed: 0,economy,microsoft,obama,palestine,ABC News,Bloomberg,Business Insider,CNN,Economic Times,Forbes,...,758,759,760,761,762,763,764,765,766,767
0,1,0,0,0,0,1,0,0,0,0,...,-0.072151,0.121403,0.14478,0.01522,0.350567,-0.544805,-0.035569,-0.109052,-0.027532,-0.585209
1,1,0,0,0,0,0,0,0,0,0,...,0.209577,0.066699,0.298735,-0.094895,0.057249,-0.218789,-0.017792,-0.450655,0.190671,-0.455926
2,1,0,0,0,0,1,0,0,0,0,...,-0.108347,-0.309664,-0.160288,-0.121127,0.142184,-0.259542,-0.021993,0.000632,0.143739,-0.074407
3,0,0,1,0,0,0,0,0,0,0,...,0.006113,0.090407,0.185474,-0.271995,0.230278,-0.540464,0.233256,-0.412061,0.124626,0.189391
4,1,0,0,0,0,0,0,0,0,0,...,0.048723,-0.001103,-0.078952,-0.140065,0.004386,0.036686,0.092792,-0.236575,-0.077332,-0.299455
