In [1]:
import numpy as np
import pandas as pd
import os
import re
from nltk.stem import PorterStemmer

In [2]:
DATA_FOLDER = 'data/'
PREDICTIONS_FOLDER = os.path.join(DATA_FOLDER, 'predictions/')
all_data = pd.read_csv(os.path.join(DATA_FOLDER, 'all_data.csv'))
all_data

Unnamed: 0,web-scraper-order,text,section,topic,urls,userMentions,isReply,emojis
0,1604198576-694,Hello - yes to the above suggestions. Please p...,bugreports,Emails from Zooniverse projects now fail to ar...,https://status.zooniverse.org,,,
1,1604198689-999,"hey @team , why is there audio in the field g...",bugreports,audio in field guide,,@team,,
2,1604198607-767,"I should say, you have to be trying to go from...",bugreports,Frame 4 button,,,,
3,1604198528-563,I believe that there are some problems at the ...,bugreports,Possible level-up bug,,,,
4,1604198895-1630,Thank you for flagging this and your construct...,bugreports,Opting out of the level-up popups?,https://github.com/zooniverse/Panoptes-Front-E...,,,
...,...,...,...,...,...,...,...,...
90496,1604198211-259,"Scattered Light comes in many, many guises in ...",virgo,Virgo O3 glitch classes (placeholder),/projects/zooniverse/gravity-spy/talk/subjects...,,True,
90497,1604198222-261,"For proposing new Virgo glitch classes, please...",virgo,Process for Making New Virgo Classes Official,https://www.zooniverse.org/projects/zooniverse...,,,
90498,1604198211-260,Horizontal lines at medium to high frequencies...,virgo,Virgo O3 glitch classes (placeholder),/projects/zooniverse/gravity-spy/talk/subjects...,,True,
90499,1604198211-257,"On a related note, @dziakj1 has discovered (an...",virgo,Virgo O3 glitch classes (placeholder),/projects/zooniverse/gravity-spy/talk/subjects...,@dziakj1,True,


In [3]:
def get_predictions_agreement(df):
    file = open(os.path.join(DATA_FOLDER, 'agreement_words.txt'), 'r+')
    agreement_words = file.readlines()
    file.close()
    ps = PorterStemmer()
    return df['text'].apply(lambda text: int(any(ps.stem(re.sub(r'\n', '', word)) in ps.stem(re.sub(r'\#\S+|\n', '', text).lower()).split() for word in agreement_words)))

In [4]:
def get_predictions_vocatives(df):
    return 1 - df['userMentions'].isna().astype(int)

In [5]:
def get_predictions_group_inclusive_pronouns(df):
    PRONOUNS = ['everybody', 'everyone', 'our', 'ours', 'ourself', 'ourselves', 'us', 'we']
    return df['text'].apply(lambda text: int(any(word in re.sub(r'\#\S+|\n', '', text).lower().split() for word in PRONOUNS)))

In [6]:
def get_predictions_salutations(df):
    file = open(os.path.join(DATA_FOLDER, 'salutation_words.txt'), 'r+')
    salutation_words = file.readlines()
    file.close()
    return df['text'].apply(lambda text: int(any((re.sub(r'\n', '', word) in re.sub(r'\#\S+|\n', '', text).lower()) if len(word.split()) > 1 else (re.sub(r'\n', '', word) in re.sub(r'\#\S+|\n', '', text).lower().split()) for word in salutation_words)))

In [7]:
# category 4 - Continuing a thread
positives = all_data[all_data['isReply'] == 1]
print('Total instances', positives.shape[0])
print('Section distribution in positves', np.unique(positives['section'], return_counts=True))

Total instances 4869
Section distribution in positves (array(['bugreports', 'chat', 'collections', 'help', 'ligo', 'notes',
       'science', 'virgo'], dtype=object), array([ 105,  318,  103,  431,   46, 3621,  237,    8], dtype=int64))


In [8]:
# category 8 - Expressing agreement or disagreement
all_data['prediction'] = get_predictions_agreement(all_data)
positives = all_data[all_data['prediction'] == 1]
print('Total instances', positives.shape[0])
print('Section distribution in positves', np.unique(positives['section'], return_counts=True))
all_data.to_csv(os.path.join(PREDICTIONS_FOLDER, 'agreement_predicted.csv'), index=False)

Total instances 3874
Section distribution in positves (array(['bugreports', 'chat', 'collections', 'help', 'ligo', 'notes',
       'science', 'virgo'], dtype=object), array([  35,  185,   90,  171,   56, 3234,   99,    4], dtype=int64))


In [9]:
# category 9 - Vocatives
all_data['prediction'] = get_predictions_vocatives(all_data)
positives = all_data[all_data['prediction'] == 1]
print('Total instances', positives.shape[0])
print('Section distribution in positves', np.unique(positives['section'], return_counts=True))
all_data.to_csv(os.path.join(PREDICTIONS_FOLDER, 'vocatives_predicted.csv'), index=False)

Total instances 4413
Section distribution in positves (array(['bugreports', 'chat', 'collections', 'help', 'ligo', 'notes',
       'science', 'virgo'], dtype=object), array([ 117,  258,  105,  306,   94, 3384,  143,    6], dtype=int64))


In [10]:
# category 10 - Addresses or refers to the group using inclusive pronouns
all_data['prediction'] = get_predictions_group_inclusive_pronouns(all_data)
positives = all_data[all_data['prediction'] == 1]
print('Total instances', positives.shape[0])
print('Section distribution in positves', np.unique(positives['section'], return_counts=True))
all_data.to_csv(os.path.join(PREDICTIONS_FOLDER, 'group_predicted.csv'), index=False)

Total instances 1977
Section distribution in positves (array(['bugreports', 'chat', 'collections', 'help', 'ligo', 'notes',
       'science', 'virgo'], dtype=object), array([  54,  197,   70,  229,   42, 1208,  172,    5], dtype=int64))


In [11]:
# category 11 - Phatics, salutations
all_data['prediction'] = get_predictions_salutations(all_data)
positives = all_data[all_data['prediction'] == 1]
print('Total instances', positives.shape[0])
print('Section distribution in positves', np.unique(positives['section'], return_counts=True))
all_data.to_csv(os.path.join(PREDICTIONS_FOLDER, 'salutations_predicted.csv'), index=False)

Total instances 2283
Section distribution in positves (array(['bugreports', 'chat', 'collections', 'help', 'ligo', 'notes',
       'science', 'virgo'], dtype=object), array([  96,  176,   73,  259,   53, 1512,  107,    7], dtype=int64))
