# Cluster document vectors of extracted features

## Load, extract vectors from DocuScope cluster counts

In [1]:
# Read the DocuScope category counts; the first CSV column becomes the index
import pandas as pd

csvpath = '/storage2/mamille3/data/hate_speech/degibert2019/docuscope_output/sentences-2022-02-18-131521/csv/CLUSTER_C_sentences.csv'
category_counts = pd.read_csv(csvpath, index_col=0)
# Trim the last four characters of every index label (presumably a file
# extension such as ".txt" -- TODO confirm) so the labels line up with the
# annotations' file_id values used for the merge below.
category_counts.index = category_counts.index.str[:-4]

# Remember the full column set so removed categories can be reported
old_cols = category_counts.columns

# Keep only the categories that have at least one non-zero entry.
# NOTE(review): NaN != 0 evaluates to True, so a column that holds only NaN
# survives this filter -- confirm that this is intended.
occurs = category_counts.ne(0).any(axis=0)
category_counts = category_counts.loc[:, occurs]

# Report the categories that were removed by the filter
new_cols = category_counts.columns
print(set(old_cols).difference(new_cols))

# Sentence splits and annotations, ordered by comment and sentence, keyed by file_id
annotations_fpath = '/storage2/mamille3/data/hate_speech/degibert2019/combined_data.csv'
annotations = (
    pd.read_csv(annotations_fpath)
    .sort_values(['comment_id', 'sentence_id'])
    .set_index('file_id')
)

# Attach the DocuScope counts to the labels/metadata via the shared index
# (default inner join: rows present in only one of the two frames are dropped)
merged = annotations.merge(category_counts, left_index=True, right_index=True)
merged

set()


Unnamed: 0,comment_id,sentence_id,text,user_id,subforum_id,num_contexts,label,Tokens,AcademicTerms,AcademicWritingMoves,...,Narrative,Negative,Positive,PublicTerms,Reasoning,Responsibility,Strategic,Uncertainty,Updates,Group
12834217_1,12834217,1,"As of March 13th , 2014 , the booklet had been...",572066,1346,0,noHate,18,0,0,...,2,0,0,1,0,0,0,0,0,
12834217_2,12834217,2,In order to help increase the booklets downloa...,572066,1346,0,noHate,36,1,0,...,1,0,0,2,1,0,1,0,0,
12834217_3,12834217,3,( Simply copy and paste the following text int...,572066,1346,0,noHate,16,1,0,...,0,0,0,1,0,0,0,0,0,
12834217_4,12834217,4,Click below for a FREE download of a colorfull...,572066,1346,0,hate,22,1,0,...,0,1,1,0,0,0,0,0,0,
12834217_5,12834217,5,Click on the `` DOWNLOAD ( 7.42 MB ) '' green ...,572066,1346,0,noHate,22,0,0,...,0,0,0,0,0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33677015_1,33677015,1,Apparently he came to the conclusion that his ...,572948,1388,0,noHate,25,0,0,...,2,2,0,0,2,0,1,0,0,
33677019_1,33677019,1,Wish we at least had a Marine Le Pen to vote f...,735154,1388,0,noHate,15,0,0,...,1,0,0,0,0,0,1,0,0,
33677019_2,33677019,2,Its like the choices are white genocide candid...,735154,1388,0,noHate,14,0,0,...,0,1,0,0,0,0,0,0,0,
33677053_1,33677053,1,Why White people used to say that sex was a si...,572266,1388,0,hate,35,2,0,...,4,1,0,0,0,0,0,1,0,


In [2]:
# Normalize count vectors (just by token length, though could do the whole scaling thing to unit variance)
# Or could take the log or something so it's not such tiny fractions
#
# Every category count is divided by the row's token count and stored on
# `merged` in a new `mean_<category>` column.
# Columns from position 8 onward are treated as the category counts. Columns
# that already carry the `mean_` prefix are skipped, so re-running this cell
# overwrites the existing normalized columns instead of adding `mean_mean_*`
# columns (which the later `'mean_' in col` selections would pick up).
# A missing count yields NaN here, and a zero token count yields NaN or inf.
count_cols = [col for col in merged.columns[8:] if not col.startswith('mean_')]
for col in count_cols:
    merged[f'mean_{col}'] = merged[col] / merged['Tokens']
merged.loc[:, ['Tokens', 'AcademicTerms', 'mean_AcademicTerms']]

Unnamed: 0,Tokens,AcademicTerms,mean_AcademicTerms
12834217_1,18,0,0.000000
12834217_2,36,1,0.027778
12834217_3,16,1,0.062500
12834217_4,22,1,0.045455
12834217_5,22,0,0.000000
...,...,...,...
33677015_1,25,0,0.000000
33677019_1,15,0,0.000000
33677019_2,14,0,0.000000
33677053_1,35,2,0.057143


In [3]:
# Pull the token-normalized columns (every column whose name contains 'mean_')
# out of `merged` as a plain 2-D NumPy array
mean_cols = [name for name in merged.columns if 'mean_' in name]
vectors = merged.loc[:, mean_cols].values
vectors.shape

(10913, 37)

In [4]:
# Check for NaNs
import numpy as np

# The minimum of an array that contains a NaN is itself NaN, so testing the
# minimum is enough to detect a NaN anywhere in `vectors`
smallest = vectors.min()
np.isnan(smallest)

True

In [5]:
# Replace NaN entries with 0 (np.nan_to_num also maps +/-inf to very large
# finite values) so the matrix can be fed to PCA
processed = np.nan_to_num(vectors)
print(processed.shape)
# Verify that no NaN is left: a NaN anywhere would make the minimum NaN
np.isnan(processed.min())

(10913, 37)


False

In [6]:
from sklearn.decomposition import PCA

# Earlier attempt with a fixed number of components, kept for reference:
# pca = PCA(n_components=5) 45.2% variance explained
# With a float n_components between 0 and 1 and svd_solver='full', scikit-learn
# keeps as many components as are needed for the explained variance to exceed
# that fraction (here 80%).
pca = PCA(n_components=.8, svd_solver='full')
# Fit on the NaN-free matrix and project it onto the retained components
reduced = pca.fit_transform(processed)
reduced.shape

(10913, 11)

In [7]:
# Print the total fraction of variance captured by the retained components,
# then display the per-component fractions as the cell's value
print(sum(pca.explained_variance_ratio_))
pca.explained_variance_ratio_

0.8264181637601749


array([0.16080998, 0.11603921, 0.0897078 , 0.08728835, 0.07682701,
       0.0744396 , 0.06902329, 0.05452897, 0.04026241, 0.02936255,
       0.02812899])

In [8]:
# Shape of the fitted loading matrix: one row per component, one column per input feature
pca.components_.shape

(11, 37)

In [9]:
# Get top DocuScope features for each component
def feats_for_factors(feature_names, pca, n_factors=20, n_feats=40):
    """Return the names of the highest-loading features of each component.

    Features are ranked by their *signed* loading in ``pca.components_``
    (largest first), so a feature with a strongly negative loading ranks
    last, below features whose loading is zero.

    Args:
        feature_names: Sequence of feature names; position ``i`` names
            column ``i`` of ``pca.components_``.
        pca: Fitted object exposing a 2-D ``components_`` array with one row
            per component and one column per feature.
        n_factors: Maximum number of components (leading rows) to report.
        n_feats: Maximum number of features to report per component.

    Returns:
        2-D array of feature names with one row per reported component,
        ordered from highest to lowest loading.

    Raises:
        ValueError: If ``n_feats`` is negative.
    """
    if n_feats < 0:
        raise ValueError(f'n_feats must be non-negative, got {n_feats}')
    # Reverse the ascending argsort first and slice afterwards: slicing the
    # ascending order with `-n_feats:` returns every feature when n_feats is 0.
    descending = np.argsort(pca.components_, axis=1)[:, ::-1]
    top = descending[:n_factors, :n_feats]
    # Integer-array indexing maps indices to names in one step and, unlike
    # np.vectorize without otypes, also works for an empty selection.
    return np.asarray(feature_names)[top]

In [10]:
# Names of the normalized columns (those containing 'mean_'), in the order in
# which they appear in `merged`
feature_names = list(filter(lambda name: 'mean_' in name, merged.columns))
feature_names

['mean_AcademicTerms',
 'mean_AcademicWritingMoves',
 'mean_Character',
 'mean_Citation',
 'mean_CitationAuthority',
 'mean_CitationHedged',
 'mean_ConfidenceHedged',
 'mean_ConfidenceHigh',
 'mean_ConfidenceLow',
 'mean_Contingent',
 'mean_Description',
 'mean_Facilitate',
 'mean_FirstPerson',
 'mean_ForceStressed',
 'mean_Future',
 'mean_InformationChange',
 'mean_InformationChangeNegative',
 'mean_InformationChangePositive',
 'mean_InformationExposition',
 'mean_InformationPlace',
 'mean_InformationReportVerbs',
 'mean_InformationStates',
 'mean_InformationTopics',
 'mean_Inquiry',
 'mean_Interactive',
 'mean_MetadiscourseCohesive',
 'mean_MetadiscourseInteractive',
 'mean_Narrative',
 'mean_Negative',
 'mean_Positive',
 'mean_PublicTerms',
 'mean_Reasoning',
 'mean_Responsibility',
 'mean_Strategic',
 'mean_Uncertainty',
 'mean_Updates',
 'mean_Group']

In [11]:
# Top 5 features for every fitted component. The component count is read from
# the fitted model rather than hard-coded, so this cell stays correct when the
# variance threshold (and with it the number of retained components) changes.
topfeats = feats_for_factors(feature_names, pca, n_factors=pca.components_.shape[0], n_feats=5)
topfeats

array([['mean_Character', 'mean_InformationStates', 'mean_Group',
        'mean_CitationHedged', 'mean_ConfidenceLow'],
       ['mean_Description', 'mean_Character',
        'mean_InformationExposition', 'mean_FirstPerson',
        'mean_InformationChange'],
       ['mean_Positive', 'mean_ForceStressed', 'mean_Description',
        'mean_Character', 'mean_MetadiscourseInteractive'],
       ['mean_Interactive', 'mean_Description', 'mean_Positive',
        'mean_ForceStressed', 'mean_Character'],
       ['mean_Negative', 'mean_ForceStressed', 'mean_AcademicTerms',
        'mean_MetadiscourseCohesive', 'mean_InformationStates'],
       ['mean_InformationExposition', 'mean_ForceStressed',
        'mean_AcademicTerms', 'mean_Strategic', 'mean_InformationStates'],
       ['mean_ForceStressed', 'mean_Narrative', 'mean_Interactive',
        'mean_Character', 'mean_FirstPerson'],
       ['mean_AcademicTerms', 'mean_InformationTopics',
        'mean_PublicTerms', 'mean_InformationPlace', 'mean_F

In [12]:
# Show the top features as a table: one row per component, columns ordered from highest to lowest loading
pd.DataFrame(topfeats)

Unnamed: 0,0,1,2,3,4
0,mean_Character,mean_InformationStates,mean_Group,mean_CitationHedged,mean_ConfidenceLow
1,mean_Description,mean_Character,mean_InformationExposition,mean_FirstPerson,mean_InformationChange
2,mean_Positive,mean_ForceStressed,mean_Description,mean_Character,mean_MetadiscourseInteractive
3,mean_Interactive,mean_Description,mean_Positive,mean_ForceStressed,mean_Character
4,mean_Negative,mean_ForceStressed,mean_AcademicTerms,mean_MetadiscourseCohesive,mean_InformationStates
5,mean_InformationExposition,mean_ForceStressed,mean_AcademicTerms,mean_Strategic,mean_InformationStates
6,mean_ForceStressed,mean_Narrative,mean_Interactive,mean_Character,mean_FirstPerson
7,mean_AcademicTerms,mean_InformationTopics,mean_PublicTerms,mean_InformationPlace,mean_FirstPerson
8,mean_AcademicTerms,mean_Narrative,mean_Negative,mean_Positive,mean_Description
9,mean_InformationTopics,mean_PublicTerms,mean_Narrative,mean_InformationPlace,mean_ForceStressed
