In [28]:
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns

from tqdm import tqdm

from sklearn.model_selection import train_test_split

import re
import spacy
from spacy.tokens import DocBin

pd.options.display.max_colwidth = 300

In [9]:
def json_tweets_to_pd(jsonfile):
    '''
    read paginated json file to df
    input (string): .json file created by searchtweets module
    output (pd.DataFrame)
    '''
    raw_df = pd.read_json(jsonfile, lines=True)
    df = pd.DataFrame()
    for page in raw_df['data']:
        df = pd.concat([df, pd.DataFrame(page)])
    return df

# Creating training set

In [7]:
!ls $PWD/get_tweets

get_tweets.ipynb         search_stock.yaml        tweets_positive.json
merge_tweets.ipynb       search_with_query.yaml   tweets_positive_old.json
search_positive.yaml     [1m[36mtweets_data[m[m


In [13]:
df_positive = json_tweets_to_pd('get_tweets/tweets_positive.json')

# drop tweets withheld by the twitter accounts
df_positive = df_positive.drop(df_positive[df_positive.withheld.notnull()].index)

df_positive.drop(columns=['id', 'withheld'], inplace=True)
df_positive = df_positive.astype({'text':'string'})
df_positive.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15464 entries, 0 to 499
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    15464 non-null  string
dtypes: string(1)
memory usage: 241.6 KB


In [14]:
df_tweets = pd.read_csv('data/tweets_dow.csv', usecols = ['text'], dtype={'text':'string'})
df_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186204 entries, 0 to 186203
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    186204 non-null  string
dtypes: string(1)
memory usage: 1.4 MB


In [15]:
n_positive = len(df_positive)
df_positive['climate_related'] = np.ones(n_positive, dtype=bool)

df_negative = df_tweets.sample(n_positive, random_state = 2022)

In [16]:
# keywords = ["climate action", "environment", "climate change", "renewable", "recycle", "earth", "emission", "carbon footprint", ...]
with open("keywords.txt", "r") as f:
    keywords = [line.rstrip() for line in f]
negative = df_negative.text.apply(lambda text: not any([keyword in text.lower() for keyword in keywords]))

In [17]:
df_negative = df_negative[negative]
df_negative['climate_related'] = np.zeros(len(df_negative), dtype=bool)

In [18]:
df = pd.concat([df_positive, df_negative]).sample(frac=1)

# Preprocessing

In [19]:
def clean_up(tweet): 

    parsed_tweet = tweet
    
    # remove url
    parsed_tweet = re.sub(r"\S*https?:\S*", "", parsed_tweet, flags=re.MULTILINE)

    # remove non-ascii
    parsed_tweet = re.sub(r"[^\x00-\x7F]+", "", parsed_tweet)

    # remove 'RT'
    parsed_tweet = re.sub(r"^RT\s", "", parsed_tweet)
    return parsed_tweet

In [20]:
def preprocess(df, embed):
    '''
    Preprocess the dataframe into spacy pipeline for later classification
    ---
    Input:
    df (DataFrame): Pandas dataframe containing the raw text and outputs.
    embed (str): Name of pipeline embedding used

    Output:
    df (DataFrame): Preprocessed input dataframe
    docs (doc): SpaCy doc object that stores text data along with classification
    '''

    # clean up tweets
    df.text = df.text.apply(clean_up)

    # Store the data into tuples
    data = tuple(zip(df.text.tolist(), df.climate_related.tolist())) 
    
    # Load English library from SpaCy
    nlp = spacy.load(embed)
    print(data[0])

    # Storage for docs
    docs = []

    # One-hot encoding for the classifications
    for doc, label in tqdm(nlp.pipe(data, as_tuples=True), total = len(data)):
        
        if label:
            doc.cats['climate_related'] = 1
        else:
            doc.cats['climate_related'] = 0
        # print(doc.cats)
        
        docs.append(doc)
    return df, docs

In [21]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2)
train_df, valid_df = train_test_split(df, test_size=0.2)

print("Train:",len(train_df), "Valid:", len(valid_df), "Test:", len(test_df))

Train: 24430 Valid: 6108 Test: 6108


In [25]:
# Covert the train and valid dataframes to .spacy files for training

# need to install one of the models to currently activated conda env by running:
# python -m spacy download en_core_web_sm
# python -m spacy download en_core_web_md
# python -m spacy download en_core_web_lg
# python -m spacy download en_core_web_trf



# embed = "en_core_web_sm" # small
# embed = "en_core_web_md" # middle
embed = "en_core_web_lg" # large
# embed = "en_core_web_trf" # roberta

# Preprocess the dataframes for train data
train_data, train_docs = preprocess(train_df, embed)
# Save data and docs in a binary file to disc
doc_bin = DocBin(docs=train_docs)
doc_bin.to_disk("./spacy_data/textcat_train.spacy")

# Preprocess the dataframes for test data
valid_data, valid_docs = preprocess(valid_df, embed)
# Save data and docs in a binary file to disc
doc_bin = DocBin(docs=valid_docs)
doc_bin.to_disk("./spacy_data/textcat_valid.spacy")

('Smart home tech can help prevent/detect a fire in your home. Learn how:  #ThinkSafe ', False)


100%|██████████| 24430/24430 [01:13<00:00, 334.65it/s]


('PODCAST: VP Kathryn Karol shares insight on CEO Jim Umpleby, our policy priorities and the new administration.  ', False)


100%|██████████| 6108/6108 [00:19<00:00, 309.00it/s]


In [26]:
!python -m spacy init config "./config.cfg" --lang en --pipeline textcat_multilabel --optimize efficiency --force

[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: textcat_multilabel
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [27]:
!python -m spacy train ./config.cfg --output ./output --paths.train ./spacy_data/textcat_train.spacy --paths.dev ./spacy_data/textcat_valid.spacy --verbose

[2022-06-15 01:49:58,893] [DEBUG] Config overrides from CLI: ['paths.train', 'paths.dev']
[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using CPU[0m
[1m
[2022-06-15 01:49:59,624] [INFO] Set up nlp object from config
[2022-06-15 01:49:59,632] [DEBUG] Loading corpus from path: spacy_data/textcat_valid.spacy
[2022-06-15 01:49:59,633] [DEBUG] Loading corpus from path: spacy_data/textcat_train.spacy
[2022-06-15 01:49:59,633] [INFO] Pipeline: ['textcat_multilabel']
[2022-06-15 01:49:59,636] [INFO] Created vocabulary
[2022-06-15 01:49:59,704] [INFO] Finished initializing nlp object
[2022-06-15 01:50:10,308] [INFO] Initialized pipeline components: ['textcat_multilabel']
[38;5;2m✔ Initialized pipeline[0m
[1m
[2022-06-15 01:50:10,318] [DEBUG] Loading corpus from path: spacy_data/textcat_valid.spacy
[2022-06-15 01:50:10,318] [DEBUG] Loading corpus from path: spacy_data/textcat_train.spacy
[2022-06-15 01:50:10,324] [DEBUG] Removed existing output directory: output/model-best
[2