In [1]:
import pandas as pd
import numpy as np
import glob
import random
import re
import nltk
from collections import defaultdict
from nltk.corpus import stopwords
from sklearn.cluster import SpectralClustering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, cross_val_predict
from sklearn.metrics import silhouette_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

import multiprocessing
import matplotlib.pyplot as plt
from matplotlib import pyplot
from gensim.models import Word2Vec, KeyedVectors
from gensim.test.utils import datapath, get_tmpfile
from gensim.scripts.glove2word2vec import glove2word2vec
from csv import DictWriter
from collections import Counter
from dateutil import parser
import warnings 
warnings.filterwarnings(action = 'ignore') 

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Labeling from scratch

Since we don't have any labeled data at the beginning of our analysis, we need to define the classes we're interested in and create initial labeled data from scratch.

## Defining research classes for initial labeling

* label = 0: not related to real bears
* label = 1: related to real bears, and even related to encountering real bears
* label = 2: related to real bears, but not related to encountering real bears

Class 1 is the most important category as it will be used to analyze the frequency and geographical distribution of bear encounters. Class 2 can be used as a supplementary category to evaluate people's emotional responses to bear encounters.

## Define reuseable variables and functions

Based on the bear-encountering ratio displayed by [iNaturalist](https://github.com/persecond17/Black_Bear_CDFW2023/blob/main/social_media_NLP_project/step_2_preprocessing/3_visualizations.ipynb), we have a reference distribution to follow. If the input record is classified as class 1 and meets both the Geo Quota and seasonal quota requirements, then we can include it in the labeled dataset.


In [2]:
geo_quota = {'Humboldt':220, 'Tulare':80, 'Los Angeles':80, 'Mariposa':70, \
             'El Dorado':60, 'Tuolumne':50, 'other':440}
season_quota = {'spring':70, 'summer':330, 'autumn':430, 'winter':170}

In [3]:
titles = [0, 'index','tweet_id','created_datetime','content','author_id','place_id',
          'location','longitude','latitude','county','label','season']

In [4]:
def season_identify(date):
    """
    Identify the posted season of record. 
    """
    month = int(date[5:7])
    if month < 4: return 'spring'
    elif month < 7: return 'summer'
    elif month < 10: return 'autumn'
    elif month <= 12: return 'winter'

In [5]:
def county_identify(series):
    """
    Count the number of occurrences of each county 
    in the series, and return the dictionary with 
    the count of each county.
    """
    temp = {'Humboldt':0, 'Tulare':0, 'Los Angeles':0, 'Mariposa':0, \
            'El Dorado':0, 'Tuolumne':0, 'other':0}
    for c in series:
        if c == 'Humboldt': temp['Humboldt'] += 1
        elif c == 'Tulare': temp['Tulare'] += 1
        elif c == 'Los Angeles': temp['Los Angeles'] += 1
        elif c == 'Mariposa': temp['Mariposa'] += 1
        elif c == 'El Dorado': temp['El Dorado'] += 1
        elif c == 'Tuolumne': temp['Tuolumne'] += 1
        else: temp['other'] += 1
    return temp

In [6]:
def control_sampling_ratio(line, geo, season, labeled_df, geo_quota, season_quota):
    """
    Check if a tweet is a duplicate and if it meets certain 
    quotas (based on geographical and seasonal information),
    and returns the tweet if it does not violate any of conditions.
    """
    geo_dict = Counter(labeled_df['county'])
    season_dict = Counter(labeled_df['season'])

    if line.tweet_id in labeled_df.tweet_id:
        print('Duplicate!')
        return None
    if geo in geo_quota.keys():
        if geo_dict[geo] >= geo_quota[geo]:
            print('Geo Quota Full!')
            return None
    else:
        if geo_dict['other'] >= geo_quota['other']:
            print('Geo Quota Full!')
            return None
    if season_dict[season] >= season_quota[season]:
        print('Seasonal Quota Full!')
        return None
    return line

In [7]:
def random_choose(target_df):
    """
    Random choose a record and parse it's 
    content, geo, and season information.
    """
    line = target_df.iloc[random.choice(target_df.index)]
    tweet = line.content
    geo = line.county
    season = season_identify(line.created_datetime)
    return line, tweet, geo, season

## Loading the Unlabeled and Labeled Datasets

Let's load the unlabeled data from [the previous step](https://github.com/persecond17/Black_Bear_CDFW2023/blob/main/social_media_NLP_project/step_3_data_annotation/1_filtering.ipynb), as well as a small labeled dataset to check the distribution and explore.

In [8]:
target_df = pd.read_csv('filtered.csv').reset_index().drop(columns=['Unnamed: 0', 'level_0'])
target_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167202 entries, 0 to 167201
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   index             167202 non-null  int64  
 1   tweet_id          167202 non-null  float64
 2   created_datetime  167202 non-null  object 
 3   content           167202 non-null  object 
 4   author_id         167202 non-null  float64
 5   place_id          162300 non-null  object 
 6   location          162223 non-null  object 
 7   longitude         167202 non-null  float64
 8   latitude          167202 non-null  float64
 9   county            167202 non-null  object 
dtypes: float64(4), int64(1), object(5)
memory usage: 12.8+ MB


In [9]:
labeled_df = pd.read_csv('labeled.csv')
print(f"Currently, we have {len(labeled_df.loc[labeled_df['label']==1])} Class 1 records.")

Currently, we have 83 Class 1 records.


## Random Sampling, Quota Checking, Labeling, and Saving Records

In this section, I randomly selected records from the unlabeled dataset and manually labeled them. 

However, this process is time-consuming due to the low ratio of the target Class 1 tweets, which is **less than 5%** according to my statistics. Therefore, we aimed to collect as few manually labeled tweets as possible to trigger further automatic learning techniques.

In [10]:
def manually_labeling(label, line, geo, season, labeled_df, geo_quota, season_quota, titles):
    """
    Write qualified tweets (labeled as 1 and satisfying 
    both the geographical and seasonal quotas, or 
    labeled as 0 or 2) to the labeled database.
    """
    new_cols = pd.DataFrame({'label': label, 'season': season}, index=range(1))
    if label == 1:
        line = control_sampling_ratio(line, geo, season, labeled_df, geo_quota, season_quota)
        if len(line) > 0:
            row = pd.concat([line, new_cols])
        else: row = dict()
    else: 
        row = pd.concat([line, new_cols])

    # Open CSV file in append mode
    with open('labeled.csv', 'a') as f:
        writer = DictWriter(f, fieldnames=titles)
        # writer.writeheader() #only used in the 1st time
        writer.writerow(dict(row))
        print("Success saved!")

In [11]:
line, tweet, geo, season = random_choose(target_df)
tweet

"It's wedding season and so excited for the big day for Eveleen + Gus.  And you gotta love Cali..how cool is it to have snow for your engagement pictures.  Simply beautiful pc | michellelacsonphotography @ Big Bear… https://t.co/qXrG6Kvw8J"

In [12]:
label = 0 #0 or 1 or 2
manually_labeling(label, line, geo, season, labeled_df, geo_quota, season_quota, titles)

Success saved!


# Iterative Semi-Supervised Learning: Spectral Clustering + Ada Boosting

Let's mount the Google Drive folder to the Colab notebook and set the working directory to the "My Drive" folder first.

In [13]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)
%cd /content/drive/My Drive/

Mounted at /content/drive/
/content/drive/My Drive


## Merge and preprocess labeled datasets

In [14]:
def merge_csv() -> pd.DataFrame:
    """
    Obtain a list of all CSV files in a given directory, 
    iterate through each CSV file and load it into a DataFrame. 
    Concatenate all the DataFrames into a single one. Parse dates 
    and remove any duplicate rows based on the tweet_id column.
    """
    csv_files = glob.glob('*.csv')
    df_list = []
    for file in csv_files:
        if file.startswith('sample_df'):
            df = pd.read_csv(file)
            df_list.append(df)

    df = pd.concat(df_list, axis=0, ignore_index=True)
    df['created_datetime'] = df['created_datetime'].apply(parser.parse)\
                                .apply(lambda x: x.strftime('%Y-%m-%d'))
    df['created_datetime'] = pd.to_datetime(df['created_datetime'])
    df.drop_duplicates(subset=['tweet_id'], inplace=True)
    return df

In [15]:
def text_clean_and_tokenize(text: str) -> object:
    '''
    Preprocess the text data by removing URLs, hashtags, 
    punctuations, and extra spaces. Convert uppercase 
    letters to lowercase, tokenize the words, remove 
    stopwords, and perform text lemmatization.
    '''
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'#\w+|\@\w+', '', text)
    text  = "".join((char if char.isalpha() else " ") for char in text)
    text = re.sub('[\W_]+', ' ', text)
    text = re.sub(' +', ' ',text)
    text = text.strip()
    text = text.lower()
    text = nltk.word_tokenize(text)#re.split('\W+', text)
    text = [word for word in text if word not in stop_words]

    ps = nltk.WordNetLemmatizer()
    text = [ps.lemmatize(w) for w in text]
    merged_text = ' '.join(text)
    return text, merged_text

To begin the analysis, we will read all of the labeled datasets `labeled_df` as well as the unlabeled dataset `new`. Then we can obtain a preliminary understanding of the number of positively labeled tweets as well as the timeframe in which they were posted.

Next, we will apply preprocessing techniques such as removing unnecessary parts, lowercasing, tokenizing, and lemmatization on tweets to ensure that the text data is clean and structured.

In [16]:
labeled_df = merge_csv()
print(labeled_df.groupby('label')['label'].count())
print(f"Start from {min(labeled_df.created_datetime)} to {max(labeled_df.created_datetime)}")

label
0    318
1    122
2     80
Name: label, dtype: int64
Start from 2010-06-07 00:00:00 to 2022-12-27 00:00:00


In [17]:
labeled_df['tokenized_tweets'] = labeled_df['content'].apply(lambda x: text_clean_and_tokenize(x))
labeled_df[['tokenized', 'merged']] = labeled_df['tokenized_tweets'].apply(lambda x: pd.Series(x))
labeled_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 520 entries, 0 to 519
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   tweet_id          520 non-null    float64       
 1   created_datetime  520 non-null    datetime64[ns]
 2   content           520 non-null    object        
 3   author_id         520 non-null    float64       
 4   place_id          515 non-null    object        
 5   location          515 non-null    object        
 6   longitude         520 non-null    float64       
 7   latitude          520 non-null    float64       
 8   county            520 non-null    object        
 9   label             520 non-null    int64         
 10  tokenized_tweets  520 non-null    object        
 11  tokenized         520 non-null    object        
 12  merged            520 non-null    object        
dtypes: datetime64[ns](1), float64(4), int64(1), object(7)
memory usage: 56.9+ KB


In [18]:
new = pd.read_csv("filtered.csv").drop(columns=['Unnamed: 0', 'index'])
new['tokenized_tweets'] = new['content'].apply(lambda x: text_clean_and_tokenize(x))
new[['tokenized', 'merged']] = new['tokenized_tweets'].apply(lambda x: pd.Series(x))
new['created_datetime'] = pd.to_datetime(new['created_datetime'])
new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 167202 entries, 0 to 167201
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   tweet_id          167202 non-null  float64       
 1   created_datetime  167202 non-null  datetime64[ns]
 2   content           167202 non-null  object        
 3   author_id         167202 non-null  float64       
 4   place_id          162300 non-null  object        
 5   location          162223 non-null  object        
 6   longitude         167202 non-null  float64       
 7   latitude          167202 non-null  float64       
 8   county            167202 non-null  object        
 9   tokenized_tweets  167202 non-null  object        
 10  tokenized         167202 non-null  object        
 11  merged            167202 non-null  object        
dtypes: datetime64[ns](1), float64(4), object(7)
memory usage: 15.3+ MB


## Vectorizing Data using Pre-Trained [GloVe](https://nlp.stanford.edu/projects/glove/) Model

In this chapter, we will be leveraging a pre-trained GloVe (Global Vectors for Word Representation) model to convert textual data into numerical vectors. GloVe is an unsupervised learning algorithm that generates word embeddings from a large corpus of text, enabling us to represent each word in our labeled tweets as a high-dimensional vector that reflects its meaning in the context of the corpus. These vectors can then be used for further analysis, such as spectral clustering and Ada Boosting.

First, let'sload a pre-trained GloVe word embedding model and convert it to the word2vec format. **Glove.twitter.27B.200d.txt** is a pre-trained word embedding model that has been trained on a large corpus of Twitter data. It contains 27 billion tokens and 1.2 million vocabulary words, each with a 200-dimensional vector representation.

In [19]:
filepath = "glove.twitter.27B.200d.txt"
tmp_file = get_tmpfile(filepath)
_ = glove2word2vec(filepath, tmp_file)
glove = KeyedVectors.load_word2vec_format(tmp_file, binary=False)

To address the issue of having limited labeled data with positive class 1, I will augment this class by copying its existing data to increase the overall size of the labeled dataset. This will help improve the model's ability to capture the underlying patterns in the data. 

The augmented labeled data will be shuffled and be used to create the training set, consisting of X_train and y_train.

In [42]:
train_df = pd.concat([labeled_df.loc[labeled_df['label'] != 1], 
                      labeled_df.loc[labeled_df['label'] == 1], 
                      labeled_df.loc[labeled_df['label'] == 1]])\
             .sample(frac=1, random_state=42).reset_index(drop=True)

X_train, y_train = train_df['merged'].tolist(), train_df['label'].tolist()
print(len(X_train))

642


Then, we will retrieve a sample of the unlabeled data to be used as the test set for labeling.

In [21]:
i = 6 # i can be any integer in range(0, len(new)//500)
test_df = new.iloc[i*500: (i+1)*500]
X_test = test_df['merged']
print(len(X_test))

500


Now, I will define a function to create a list of sentence vectors, and then use it to convert the tweets in both X_train and X_test into vectors.

In [22]:
def embedding(X: list, glove: dict) -> list:
    """
    Iterate through each sentence of the input, extract word 
    vectors from the pre-trained GloVe model for each word, 
    and compute the mean of these vectors to obtain the sentence 
    vector. If no words are found in GloVe for a sentence, a zero 
    vector is used instead. Return a corpus of sentence vectors.
    """
    corpus = []
    for sentence in X:
        vectors = [glove[word] for word in sentence if word in glove]
        if vectors:
            sentence_vec = np.mean(vectors, axis=0)
        else:
            sentence_vec = np.zeros(glove.vector_size)
        corpus.append(sentence_vec)
    return corpus

In [43]:
X_test_vectors = embedding(X_test, glove)
X_train_vectors = embedding(X_train, glove)

## Spectral Clustering: Select cluster labels with the highest proportion of class 1

Spectral Clustering is powerful for capturing complex structures and patterns in data, including non-linear, non-convex, and irregularly shaped clusters. In addition, Spectral Clustering can effectively reduce the dimensionality of high-dimensional data and is robust to noise and initialization parameters. It is especially well-suited for text clustering, which often exhibits these characteristics. 

If you're interested in learning more about four popular clustering techniques from scratch, including K-means, K-means++, Spectral Clustering, and DBSCAN, as well as their respective advantages and limitations, and how to implement them using Python, please click the notebook: [Exploring Clustering Techniques from Scratch: K-means, K-means++, Spectral Clustering, and DBSCAN](https://github.com/persecond17/Black_Bear_CDFW2023/blob/main/social_media_NLP_project/step_3_data_transformation/Clustering_from_Scratch.ipynb).

Here, I am going to use it to train on the training set and identify clusters that have a high ratio of class 1 tweets.

In [24]:
def create_spectral_labels(X_train_vectors: list, glove: dict, k: int, train_df: pd.DataFrame):
    """
    Create and fit a Spectral Clustering object to the training set, 
    and cluster it into a specified number (k) of clusters. Add the 
    cluster labels as a column to the training dataframe, and return 
    the updated dataframe and the Spectral Clustering model.    
    """
    spectral = SpectralClustering(n_clusters=k, affinity='nearest_neighbors', 
                                  assign_labels='kmeans')
    spectral.fit(X_train_vectors)
    labels = spectral.fit_predict(X_train_vectors)
    train_df['cluster_label'] = labels
    return train_df, spectral

In [25]:
def find_best_fit_cluster(train_df: pd.DataFrame, k: int):
    """
    Calculate the ratio of class 1 in each cluster and choose the 
    top 3 clusters with the highest ratio. If the ratio of a cluster 
    in these 3 chosen clusters is higher than the threshold of 0.95, 
    it will be added to a list and returned.
    """
    cluster_df = train_df.groupby(train_df['cluster_label']).apply(lambda x: x)
    results = defaultdict(list)
    for cl in range(k):
        pos_num = cluster_df.loc[(cluster_df['cluster_label'] == cl) & (cluster_df['label'] == 1)]['label'].sum()
        pos_ratio = pos_num / cluster_df.loc[cluster_df['cluster_label']==cl]['label'].sum()
        results[cl] = [pos_num, round(pos_ratio, 2)]
    target = sorted(results.items(), key=lambda x: -x[1][1])[:3]
    chosen_cluster_labels = [cluster[0] for cluster in target if cluster[1][1] > .95] 
    return cluster_df, chosen_cluster_labels

Let's review the class 1 tweets in the chosen clusters to make sure it's quality and accuracy:

In [56]:
train_df, spectral = create_spectral_labels(X_train_vectors, glove, 10, train_df)
cluster_df, chosen_cluster_labels = find_best_fit_cluster(train_df, 10)

class_1_tweets = cluster_df.loc[(cluster_df['cluster_label'].isin(chosen_cluster_labels)) \
                                & (cluster_df['label'] == 1)]['content']
for s in class_1_tweets[:20]:
    print(s)

That is a beautiful bear...😎😎😎😎😎 https://t.co/Ucrw15Jp1F
@JessWaltonYR Bear Claw
That is a beautiful bear...😎😎😎😎😎 https://t.co/Ucrw15Jp1F
A bRuin bear. In its natural habitat. Before it is slaughtered https://t.co/V18i7tM6UP
pure_blitz mama bear with cub. #constellation #constellations #needlepushers #blitzicr #csun… https://t.co/vXIqhOx0BL
@diannaaboo That’s a bear
Who brought the bear
@Nicoo_m19 Bear
@Nicoo_m19 Bear
@diannaaboo That’s a bear
Beat the bear 😂☘️ https://t.co/xQpsr9VKqY
@POWERMAYO @honeychild1229 Bear ! 🐻💚 https://t.co/7i7Tz6UIaJ
A bRuin bear. In its natural habitat. Before it is slaughtered https://t.co/V18i7tM6UP
@twm1962 In with the bear on this🐻
I just saw a bear!
@puurrraatt Omg big bear 🥺💞
Beat the bear 😂☘️ https://t.co/xQpsr9VKqY
Just saw bare bear balls at #Comicon2018 #poolpanic Don't tell my mum plz.
@twm1962 In with the bear on this🐻
@princess_jem4 Bear!!!!


Upon analyzing the class 1 tweets in the selected clusters, it is evident that they are highly related to bear-encountering scenarios, indicating that our clustering algorithm has performed well.

With this successful outcome, we can proceed to use the identified cluster labels to cluster the unlabeled data for further analysis.

In [57]:
cluster_label = spectral.fit_predict(X_test_vectors)
test_df['cluster_label'] = cluster_label

## AdaBoost: Predict labels for tweets classified into the chosen clusters

AdaBoost (Adaptive Boosting) is a powerful ensemble learning algorithm that combines weak learners to form a strong predictor. It works by iteratively training weak models on subsets of the data, where each model focuses on the instances that were previously misclassified. 

AdaBoost can handle complex non-linear relationships in the data, robustness to overfitting, and good generalization performance. 

Here, I will be using Ada Boosting to predict labels for tweets that have been classified into the selected clusters by Spectral Clustering. By doing so, we can narrow down the range of tweets that need to be manually reviewed.

In [58]:
test_df = test_df.reset_index()
test_selected_clusters = test_df.loc[test_df['cluster_label'].isin(chosen_cluster_labels)]
test_vectors_selected_clusters = [X_test_vectors[i] for i in test_selected_clusters.index.tolist()]

In [59]:
dtc = DecisionTreeClassifier(max_depth=5, random_state=42)
ada_boost = AdaBoostClassifier(base_estimator=dtc, learning_rate=1, 
                              n_estimators=200, random_state=42)
ada_boost.fit(X_train_vectors, y_train)
y_test = ada_boost.predict(test_vectors_selected_clusters)

print(f"The number of predicted class 1 tweets was found to be: {dict(Counter(y_test))[1]}")

The number of predicted class 1 tweets was found to be: 11


## Manual review of Class 1 tweets and saving them to the labeled dataset

In [30]:
def add_new_labeled_tweets(manual_class_1_tweets: list, 
                           test_selected_clusters: pd.DataFrame, 
                           labeled_df: pd.DataFrame):
    """
    Reformat the new Class 1 tweets dataframe, 
    add it to the original labeled dataset.
    """
    new_labeled_df = test_selected_clusters.loc[manual_class_1_tweets]\
                                          .drop(columns=['index', 'cluster_label', 'ada_label'])
    new_labeled_df['label'] = len(new_labeled_df) * [1]
    labeled_df = pd.concat([labeled_df, new_labeled_df])
    return labeled_df

In [60]:
test_selected_clusters['ada_label'] = y_test
test_class_1_tweets = test_selected_clusters.loc[test_selected_clusters['ada_label']==1]['content'].tolist()
test_class_1_indexes = test_selected_clusters.loc[test_selected_clusters['ada_label']==1].index

for i in range(len(test_class_1_tweets)):
    print(test_class_1_indexes[i], test_class_1_tweets[i])

88 @PdCams @LivePDNation When they said Tahoe, we knew it would be a bear. 😎
138 @MeredithFrost @CanonUSAimaging @leica_camera @NikonUSA @500px @SonyUK @SonyAlpha Bear Lake, Rocky Mountain National Park; The Gamble House main entrance looking out; Nico with a filter. All but Bear Lake taken with my iPhone. https://t.co/4ANvywmj55
156 Late night boba runs with mama bear 🤗
191 Does a bear shit in the woods? https://t.co/kFf6lwQDyY
255 Does a bear shit in the woods? https://t.co/PpThTIjCiB
284 the 1 // blackbear
365 Hiked #SunsetTrail today in the sequoianationalforest . Saw a big #blackbear about 40 feet from us while we were walking back &amp; lived to tell about it. I’m here for a reason. Leggoooo. 🐻… https://t.co/SpXPt0VaIc
370 Monterey Update: Bull &amp; Bear is white washed because of car week
410 Blackbear really is the most slept on
417 ...mountains, they risk a deadly confrontation with the bear whose roar can shake the snow from the trees.” - #MTGM19 Player’s Guide. Art by @veli

In [61]:
manual_class_1_tweets = [88, 160, 191, 255, 284, 365, 410, 417]
labeled_df = add_new_labeled_tweets(manual_class_1_tweets, test_selected_clusters, labeled_df)
labeled_df['tweet_id'].count()

528

## Merge all steps together

In [65]:
def semi_supervised_learning(labeled_df: pd.DataFrame, new: pd.DataFrame, i: int, k: int, glove: dict):
    """
    augment class 1 records in the training set
    """
    # Define the training set
    train_df = pd.concat([labeled_df.loc[labeled_df['label'] != 1], 
                      labeled_df.loc[labeled_df['label'] == 1], 
                      labeled_df.loc[labeled_df['label'] == 1]])\
             .sample(frac=1, random_state=42).reset_index(drop=True)
    X_train, y_train = train_df['merged'].tolist(), train_df['label'].tolist()
    # Extract the test set
    test_df = new.iloc[i*500: (i+1)*500]
    X_test = test_df['merged']
    # Convert text data to vectors
    X_test_vectors = embedding(X_test, glove)
    X_train_vectors = embedding(X_train, glove)

    # Apply spectral Clustering on the training set 
    # and find the cluster with high ratio of Class 1 tweets
    train_df, spectral = create_spectral_labels(X_train_vectors, glove, k, train_df)
    cluster_df, chosen_cluster_labels = find_best_fit_cluster(train_df, k)
    # Perform spectral Clustering on the test set
    # and retrieve tweets in the chosen clusters
    cluster_label = spectral.fit_predict(X_test_vectors)
    test_df['cluster_label'] = cluster_label
    test_df = test_df.reset_index()
    test_selected_clusters = test_df.loc[test_df['cluster_label'].isin(chosen_cluster_labels)]
    test_vectors_selected_clusters = [X_test_vectors[i] for i in test_selected_clusters.index.tolist()]

    # Conduct AdaBoost on the retrieved tweets
    dtc = DecisionTreeClassifier(max_depth=5, random_state=42)
    ada_boost = AdaBoostClassifier(base_estimator=dtc, learning_rate=1, 
                                  n_estimators=200, random_state=42)
    ada_boost.fit(X_train_vectors, y_train)
    y_test = ada_boost.predict(test_vectors_selected_clusters)

    # Return Class 1 tweets in the chosen clusters for manually labeling
    test_selected_clusters['ada_label'] = y_test
    test_class_1_tweets = test_selected_clusters.loc[test_selected_clusters['ada_label']==1]['content'].tolist()
    test_class_1_indexes = test_selected_clusters.loc[test_selected_clusters['ada_label']==1].index

    return test_class_1_tweets, test_class_1_indexes, test_selected_clusters

## Let's apply the function on another set of unlabeled data:

In [66]:
test_class_1_tweets, test_class_1_indexes, test_selected_clusters = semi_supervised_learning(labeled_df, new, 12, 10, glove)
for i in range(len(test_class_1_tweets)):
    print(test_class_1_indexes[i], test_class_1_tweets[i])

13 @desterella Ima facetime you in the morning when she wake up &amp; mama bear at work lol
40 @i_am_BMW It’s only a 5ft bear
43 @MsAutumnWind I read “Brown Bear Brown Bear” and”Polar Bear Polar Bear” to my granddaughter rather than watch the second half🤮
69 @GeoHolms “I don’t speak bear!”
115 Brother bear is having a kickback and all I hear is bass against my wall. 🙃
179 “ I need a bear one that does AHHHHHH” 😂😂😂😂😂 https://t.co/Y6UMYn6abB
192 @robkahl The mama bear is Cardinals in this comp
213 @StormHuntley @ranee_11 BABY BEAR MADE IT AHHHH https://t.co/EgBi4WDGed
269 “@caylacruz: @NikkMode fuck, it must!!! Hahaha by a bear or something #raaaahrrrr” lmao @ raaaahrrr
285 Dinner with my Mar Bear!! @ Eat Chow https://t.co/Am24FgvNEt
288 @poetpooch @andrealori @AMHighcrest @HRHLou @joancbaez oh, wow, you are in bear country! Are these brown or black bears?
413 Also there’s a bear https://t.co/7ZmbRcQM5U


In [67]:
manual_class_1_tweets = [40, 213, 288, 413]
labeled_df = add_new_labeled_tweets(manual_class_1_tweets, test_selected_clusters, labeled_df)
labeled_df['tweet_id'].count()

532