# Cluster document vectors of extracted features

## Load DocuScope cluster counts and extract vectors

In [18]:
# Load category counts (DocuScope output: one row per input file, one column per category)
import pandas as pd

csvpath = '/storage2/mamille3/data/hate_speech/degibert2019/docuscope_output/sentences-2022-02-18-131521/csv/CLUSTER_C_sentences.csv'
category_counts = pd.read_csv(csvpath, index_col=0)
# Drop the last 4 characters of every row label so the labels line up with `file_id`
# below (presumably a file extension such as ".txt" -- TODO confirm)
category_counts.index = category_counts.index.str.slice(0,-4)

# Remember the full category list so the dropped ones can be reported
old_cols = category_counts.columns

# Drop categories that do not occur (columns that are 0 in every row).
# NOTE: NaN compares unequal to 0, so a column that is entirely NaN is kept.
# (Original note: "Is just one category".)
category_counts = category_counts.loc[:, (category_counts != 0).any(axis=0)]

# Show categories that didn't occur
new_cols = category_counts.columns
print(set(old_cols) - set(new_cols))

# Load sentence splits and annotations, ordered by comment and sentence, keyed by file_id
annotations_fpath = '/storage2/mamille3/data/hate_speech/degibert2019/combined_data.csv'
annotations = pd.read_csv(annotations_fpath).sort_values(['comment_id', 'sentence_id']).set_index('file_id')

# Merge DocuScope output with labels, metadata.
# pd.merge defaults to an inner join, so rows missing from either side are dropped.
merged = pd.merge(annotations, category_counts, left_index=True, right_index=True)
merged

set()


Unnamed: 0,comment_id,sentence_id,text,user_id,subforum_id,num_contexts,label,Tokens,AcademicTerms,AcademicWritingMoves,...,Narrative,Negative,Positive,PublicTerms,Reasoning,Responsibility,Strategic,Uncertainty,Updates,Group
12834217_1,12834217,1,"As of March 13th , 2014 , the booklet had been...",572066,1346,0,noHate,18,0,0,...,2,0,0,1,0,0,0,0,0,
12834217_2,12834217,2,In order to help increase the booklets downloa...,572066,1346,0,noHate,36,1,0,...,1,0,0,2,1,0,1,0,0,
12834217_3,12834217,3,( Simply copy and paste the following text int...,572066,1346,0,noHate,16,1,0,...,0,0,0,1,0,0,0,0,0,
12834217_4,12834217,4,Click below for a FREE download of a colorfull...,572066,1346,0,hate,22,1,0,...,0,1,1,0,0,0,0,0,0,
12834217_5,12834217,5,Click on the `` DOWNLOAD ( 7.42 MB ) '' green ...,572066,1346,0,noHate,22,0,0,...,0,0,0,0,0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33677015_1,33677015,1,Apparently he came to the conclusion that his ...,572948,1388,0,noHate,25,0,0,...,2,2,0,0,2,0,1,0,0,
33677019_1,33677019,1,Wish we at least had a Marine Le Pen to vote f...,735154,1388,0,noHate,15,0,0,...,1,0,0,0,0,0,1,0,0,
33677019_2,33677019,2,Its like the choices are white genocide candid...,735154,1388,0,noHate,14,0,0,...,0,1,0,0,0,0,0,0,0,
33677053_1,33677053,1,Why White people used to say that sex was a si...,572266,1388,0,hate,35,2,0,...,4,1,0,0,0,0,0,1,0,


## Construct vectors
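Options for turning the raw category counts into vectors: binarize them (category present or absent in the sentence), normalize them by sentence length in tokens, etc.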

In [19]:
# Binarize count vectors: any count above 1 is capped at 1 (category present), 0 stays 0.
# The first 8 columns are annotation metadata plus Tokens; everything after them is a
# DocuScope category. Columns derived by this notebook (binary_*/mean_*) are skipped so
# that re-running this cell, or running it after the normalization cell, does not
# create binary_binary_* / binary_mean_* columns.
count_cols = [col for col in merged.columns[8:] if not col.startswith(('binary_', 'mean_'))]
for col in count_cols:
    # Vectorized equivalent of min(x, 1) per element; NaN values stay NaN
    merged[f'binary_{col}'] = merged[col].clip(upper=1)
merged.loc[:, ['Tokens', 'AcademicTerms', 'binary_AcademicTerms']]

Unnamed: 0,Tokens,AcademicTerms,binary_AcademicTerms
12834217_1,18,0,0
12834217_2,36,1,1
12834217_3,16,1,1
12834217_4,22,1,1
12834217_5,22,0,0
...,...,...,...
33677015_1,25,0,0
33677019_1,15,0,0
33677019_2,14,0,0
33677053_1,35,2,1


In [3]:
# Normalize count vectors (just by token length, though could do the whole scaling thing to unit variance)
# Or could take the log or something so it's not such tiny fractions
# Only the raw DocuScope columns are normalized: columns derived by this notebook
# (binary_*/mean_*) are skipped, so running this cell after the binarization cell, or
# running it twice, does not create mean_binary_* / mean_mean_* columns.
count_cols = [col for col in merged.columns[8:] if not col.startswith(('binary_', 'mean_'))]
for col in count_cols:
    # NOTE(review): a Tokens value of 0 would produce inf/NaN here -- confirm none exist
    merged[f'mean_{col}'] = merged[col]/merged['Tokens']
merged.loc[:, ['Tokens', 'AcademicTerms', 'mean_AcademicTerms']]

Unnamed: 0,Tokens,AcademicTerms,mean_AcademicTerms
12834217_1,18,0,0.000000
12834217_2,36,1,0.027778
12834217_3,16,1,0.062500
12834217_4,22,1,0.045455
12834217_5,22,0,0.000000
...,...,...,...
33677015_1,25,0,0.000000
33677019_1,15,0,0.000000
33677019_2,14,0,0.000000
33677053_1,35,2,0.057143


In [20]:
# Extract the binarized count vectors (use the 'mean_' prefix instead for the normalized ones).
# startswith rather than a substring test: `'binary_' in col` would also select
# columns such as 'mean_binary_*' if the normalization cell ran after binarization.
vectors = merged[[col for col in merged.columns if col.startswith('binary_')]].values
vectors.shape

(10913, 37)

In [21]:
# Check for, remove NaNs
import numpy as np

# np.min propagates NaN, so this is True when any entry is NaN. It is printed because
# a bare expression in the middle of a cell is never displayed.
print('NaNs present before cleaning:', np.isnan(np.min(vectors)))

# nan_to_num replaces NaN with 0 (and +/-inf with very large finite numbers)
processed = np.nan_to_num(vectors)
print(processed.shape)
# Sanity check after cleaning: expected to display False
np.isnan(np.min(processed))

(10913, 37)


False

In [22]:
from sklearn.decomposition import PCA

# Earlier attempt, kept for reference: PCA(n_components=5) explained 45.2% of the variance.
# A fractional n_components together with svd_solver='full' asks scikit-learn to keep as
# many components as are needed to explain more than that share of the variance (80% here).
pca = PCA(n_components=.8, svd_solver='full')
# Fit on the cleaned vectors and project them onto the retained components
reduced = pca.fit_transform(processed)
reduced.shape

(10913, 17)

In [23]:
# Total share of variance captured by the retained components ...
print(sum(pca.explained_variance_ratio_))
# ... followed by the share contributed by each individual component
pca.explained_variance_ratio_

0.8137618135637467


array([0.11831953, 0.05829824, 0.05811392, 0.05404993, 0.052656  ,
       0.05140439, 0.05067452, 0.04922939, 0.04715654, 0.04411398,
       0.04212656, 0.03620884, 0.03519375, 0.03325315, 0.03145872,
       0.02859679, 0.02290757])

In [24]:
# Shape of the loading matrix: (retained components, input features)
pca.components_.shape

(17, 37)

In [25]:
# Get top DocuScope features for each component
def feats_for_factors(feature_names, pca, n_factors=20, n_feats=40):
    """Return the names of the highest-loading features of the leading components.

    Args:
        feature_names: Sequence of feature names, ordered like the columns of
            ``pca.components_``.
        pca: Fitted decomposition object exposing ``components_`` as a
            (components x features) array.
        n_factors: Number of leading components to describe.
        n_feats: Number of features to list per component.

    Returns:
        Array of names with one row per component (at most ``n_factors``) and
        one column per feature (at most ``n_feats``), ordered from the largest
        loading to the smallest. Loadings are ranked by signed value, so
        strongly negative loadings are never listed.

    Raises:
        ValueError: If ``n_factors`` or ``n_feats`` is negative.
    """
    if n_factors < 0 or n_feats < 0:
        raise ValueError('n_factors and n_feats must be non-negative')
    loadings = np.asarray(pca.components_)[:n_factors]
    # Ascending argsort, reversed: feature indices from largest to smallest loading
    ranked = np.argsort(loadings, axis=1)[:, ::-1]
    # Slicing from the front (rather than [-n_feats:]) makes n_feats=0 yield no columns
    top = ranked[:, :n_feats]
    # Fancy indexing replaces the per-element np.vectorize lookup and also works when empty
    return np.asarray(feature_names)[top]

# Names of the binarized feature columns, in the order they appear in `merged`
feature_names = list(filter(lambda name: 'binary_' in name, merged.columns))
# Five strongest features for each of the first 11 components
topfeats = feats_for_factors(
    feature_names,
    pca,
    n_factors=11,
    n_feats=5,
)
topfeats

array([['binary_Narrative', 'binary_Description', 'binary_Character',
        'binary_InformationExposition', 'binary_Negative'],
       ['binary_Negative', 'binary_Interactive', 'binary_Character',
        'binary_ForceStressed', 'binary_AcademicTerms'],
       ['binary_Positive', 'binary_ForceStressed', 'binary_Interactive',
        'binary_FirstPerson', 'binary_AcademicTerms'],
       ['binary_Interactive', 'binary_InformationExposition',
        'binary_AcademicTerms', 'binary_MetadiscourseCohesive',
        'binary_Narrative'],
       ['binary_InformationExposition', 'binary_Description',
        'binary_AcademicTerms', 'binary_InformationStates',
        'binary_ForceStressed'],
       ['binary_AcademicTerms', 'binary_Interactive',
        'binary_InformationTopics', 'binary_Character',
        'binary_Positive'],
       ['binary_InformationExposition', 'binary_AcademicTerms',
        'binary_Narrative', 'binary_FirstPerson',
        'binary_InformationTopics'],
       ['binary_A

In [28]:
# Strip the 'binary_' prefix for display. Column-wise .str slicing is used instead of
# DataFrame.applymap, which is deprecated in recent pandas releases; the slice length
# is derived from the prefix rather than hard-coded.
pd.DataFrame(topfeats).apply(lambda names: names.str[len('binary_'):])

Unnamed: 0,0,1,2,3,4
0,Narrative,Description,Character,InformationExposition,Negative
1,Negative,Interactive,Character,ForceStressed,AcademicTerms
2,Positive,ForceStressed,Interactive,FirstPerson,AcademicTerms
3,Interactive,InformationExposition,AcademicTerms,MetadiscourseCohesive,Narrative
4,InformationExposition,Description,AcademicTerms,InformationStates,ForceStressed
5,AcademicTerms,Interactive,InformationTopics,Character,Positive
6,InformationExposition,AcademicTerms,Narrative,FirstPerson,InformationTopics
7,AcademicTerms,Positive,Negative,InformationTopics,PublicTerms
8,InformationExposition,Positive,Negative,FirstPerson,Interactive
9,FirstPerson,AcademicTerms,MetadiscourseCohesive,Character,Negative


In [32]:
# Get top documents/factor
def examples_for_factors(data, desired_colname, reduced, n_factors=20, n_examples=20):
    """Return the highest-scoring examples for each of the leading components.

    Args:
        data: pandas DataFrame whose rows are in the same order as the rows of
            ``reduced``.
        desired_colname: Name of the column in ``data`` to return values from.
        reduced: 2-D array of component scores, one row per example and one
            column per component.
        n_factors: Number of leading components to describe.
        n_examples: Number of examples to return per component.

    Returns:
        Array of shape (n_factors x n_examples), capped by the size of
        ``reduced``, holding the ``desired_colname`` values of the examples
        with the largest scores, best first. The array keeps the dtype of the
        column (object for text columns).

    Raises:
        ValueError: If ``n_factors`` or ``n_examples`` is negative.
    """
    if n_factors < 0 or n_examples < 0:
        raise ValueError('n_factors and n_examples must be non-negative')
    scores = np.asarray(reduced)
    # Ascending argsort down each column, reversed: row positions from highest to lowest score
    ranked = np.argsort(scores, axis=0)[::-1]
    # Slicing from the front (rather than [-n_examples:]) makes n_examples=0 yield no columns
    top = ranked[:n_examples, :n_factors].T
    # One positional lookup on the column instead of building a full row per element
    return data[desired_colname].to_numpy()[top]

# Show complete sentences instead of pandas' truncated cell text
pd.set_option('display.max_colwidth', None)
# Five top-scoring sentences for each of the first 11 components
examples = examples_for_factors(
    merged,
    'text',
    reduced,
    n_factors=11,
    n_examples=5,
)
pd.DataFrame(data=examples)

Unnamed: 0,0,1,2,3,4
0,well there is a little history for this post so listen up if you dont mind. there was a group in this city called the 311 boys supposedly they were a racist bunch of kids but i never thought they were but through the newspaper and manipulation of sotries they were labled as racist. well my views arnt that well known at school but some do know and ive been labled a `` 311 '' boy. a group of about 10 or so blacks approached me before school. now im not some little guy im a decent build and really have no fear of anyone or anything at this point in my life. well anyway they came up to me and started asking me if i was 311 if i was a racist and all this crap. i denied mostly for my own safety at the time. well i talked to some people who asked me who i thoguht were friends but it turns out they werent. they went right around and told them some of my views and now they came up to me at lunch today and pushed me around a little i stood up to them and got up in their faces but finally a hall monitor came and broke it up. im just wondering whats going to happen in the next few days since im the only WP kid in my school and am standing alone on this issue. just decided to post this bit of information for you wall to read keep safe and keep smart .,"Home schooling if feesible since , they must not only master the social sciences etc ... but als be able to be profiecent in the arts including music theory.Social skills and graces along w/public speaking.Weapon , hunting , survival , building , self-defense , exsplosives/bomb making & deactivation , codes , langauges , counter-intelligence , interogation , re-programing/minds and above all else always behave as a little kid ! 
Gotchya-this can all be accomplished through father and son/daughter communication starting at a young age and progressing untill the late teen years following dads clues/guidance.Learned behavior-especially since children tend to emulate the role model and mold them 'selves accordingly to thier parent of whom they love very much and want to please .","The reason y i dropped out of school was because either , i drop out or i got kicked out. i always fought with the packi gang at my school. and i was never going to back down. in auto class i handed the n * igger a monkey wrench and told him the name suited him aswell. all the non-whites hated me because i wore a `` nazi punk '' and a `` white pride '' patch on my coat and backpack. a got in a fight with some black chick and i won. the next day her butchy sister caught up with me in the hall way and said that after school her and i where gonna fight. i told her `` no if u wanna fight , we 're gonna fight right here right now. not after school where u can gather the rest of u monkies up so it 's an uneven fight '' . she huffed and stormed away. - we neevr did get to fight the last straw was the first day of school i had a forgein substitute teacher. so i was talkin and and this guy was on my case about turning around and getting my work done. he had to ask me this a few times and then finially i stood up and said , `` look , i 'm white ur black , i should be telling u whut to do ! '' ohhh man , my guidance conslour did n't like that nor did my principal .... ( this was n't at school this happened a month or so ago but these people went to my school ) i had a n * igger try to rob me at knife point in the forest while his gf sat back and watched. my friend came by and jumped him and i personally went after the chick. she ran and i caught up to her and smashed her head into a tree. thats only some of my stories ....... 
i got a few more but i dont have the time","why do you white nationalist try to spread fear in blacks saying that the jews do not care for them. jews are the reason affirmative action is in place which allows us to take your jobs. jews are the reason so many blacks have become rich including myself. so why do you think their just using us ? jews are your enemy and so are we so what you feel you need to do is make your enemy against each other. is that right. you want blacks against the jews. well guess what , not goinna happen. the jews needs blacks as much as blacks need the jews and together they are a force to be recon with .","WTF Well done liberals you have ethnically cleansed a 20,000 + yr old race of beautiful and unique white people from their own country This is a sick and vile crime I 'm crying when i look at those pictures i do n't care if you laugh and say it 's gay to cry because my people will no longer exist in 100yrs 20.000 yrs destroyed in 100 My beautiful race dying ! i do n't know what to do"
1,It is amazing that you can create an extremely sexist idea and assume that your idea is just sexist enough to be not be called on its sexism by feminists .,"First they give us sars then they are standing by watching the law get broken , WTF !?","Personally I think the Nordic royalities should marry other Nordic ( or Germanic ) people , although I did n't watch it on TV or do I have nothing against Mary or Australians .","If a 10th grade teacher says they 're grading from a rubric that says they 're grading based on grammar , spelling , sentence structure , and format only , they ca n't , in most cases , turn around and give you a failing grade for the assignment because they do n't like the content .",So now Stormfront Ireland is solely responsible for all racist attacks in Ireland. lol Ted is right this is the work of the Kikes those * * * * ers are never happy .
2,Just take a few correspondance courses if you can you can likely make up the credits .,"Hey, just looking for some fellow brothers or sister to meet with , im starving for information and Possible Klans to join. would like to be able to meet with people around my age but also older and helpful",That 's great for our Germanic youth but if you want to include all of our youth thats cutting it a bit short is n't it ?,"I have to admit I do n't think I 've ever met a Briton ??? and I am English , so you must know something that I do n't know ??? , good for you .",This is a special thanks to all those who had the patience to wait it out while I constructed this article .
3,"I wasnt going to bother with this thread but I am rapidly coming to the conclusion you can , t read we are NOT C18 , we have nothing to do with them ...... look at post #13 for Christ sake",You wo n't see any of them in a thread like this who do n't come just to take a cheap shot at us via strawman arguments or ad hominems .,"That's vile , but I ca n't say it 's a big shock , their culture seems to glorify that sort of behaviour ...","FreedomRanger-I live out in CA now but was actually going to West Virginia sometime probably too ( hopefully soon but by the end of this next year almost 100 % sure ) , the charleston area , send me a pm ? sikgirl88-I live a city over ... san bernardino ( -. - heard it enough yet ? ) , send me a pm ?","at least this is been reported , i wonder how many cases like this one happen in USA , England etc and are never reported !"
4,"Indeed, the arab population of interior BC is quite high and in some of the richer areas are almost 60 % + brown .",Nordic is a sub race that an Anglo Saxon or Celtic person could and more than likely does belong to .,"Behind every avatar is an actual person , and we can even meet .","Found along Orion 's sword just north of the bright Orion Nebula complex , these reflection nebulae are also associated with Orion 's giant molecular cloud about 1,500 light-years away , but are dominated by the characteristic blue color of interstellar dust reflecting light from hot young stars .",23 - White Supremacist - The 23rd letter of the alphabet is W. ROA - Race Over All - Rasa iznad svega SWP - Supreme White Power - Nadmoćna Bela Sila 100 % - 100 % White - 100 posto Belac-beo .
5,You can still bring that to the attention of local media and civil society groups .,The ' Civilization ' series narrated by Kenneth Clark is free on Youtube .,"Not that it matters much cause you will still be pushed up against those wonderful minorities everywhere you go , and our children will go to schools where they are in the majority .",it looks like jews have taken over ukar.org but you can read the section of the website dealing with the kosher tax on archive.org : http://web.archive.org/web/200602110...r.org/tax.html,LMAO.. this is funny .. i dunno if they 'd pay attention to ya though .. are the WWF board all jewish ? lolz
6,"My daughter is doing High School on line , she may go back to school regularly for her senior year , but this is in Colorado and I think it covers all classes .","When I go to just about any Australian city I feel like a foreigner , these cities were built by my own flesh and blood , I should not feel like an alien in them .",I'm pretty sure the average black is stronger then those white men that date asians or the ones who let their wives take control of the household .,im from wilbraham myself i 'm looking to organize a group in the area anyone who 's interested send me a PM or instant message me on theman2089 lets get * * * * in ' organized,There is more hope in Austria than normal Greece but I know one guy from it so I can post the news on Monday .
7,With the liberal media and police departments under reporting black crime they have been given a pass on their ghetto ways so many times that it as becomes acceptable behavior for them as there is no real consequences for it .,"Damn affirmative action they got the white house , now our beer halls , their is only one safe place left ... God 's save Iceland .",Leaked phone call shows Newly appointed Jewish government in Ukraine behind sniper attacks on protesters and police | European Daily News.org Thank god for the internet .,Whites wo n't exist in Canada anymore they all have left or mixed themselves away basically leaving Canada to become a large ghetto,"It is a silly video , but it highlights the nonsensical world in which Jesus lived and the world Jesus was instrumental in changing ."
8,sounds like a good thing ....... I 'll pitch in for food .... but if we meet in a park or something like that .. somebody please .... do n't forget the beer permit ...... lol,If I feel threatened especially by these fools you can bet I 'll be just as quick to kill as they are .,"I love them to death and would kill anyone that threatened them , but they will be hell when they get older ...","Lol I only listened to Bulls on Parade cause it gave me that angry rush I like from music , but now I 'll stop listening to that trash .","very interesting video , although I spotted at least one error ; Samhain is celebrated from sunset on October 31st to sunset on November 1st ."
9,"Sometimes I wonder if it is Because Whites are so much smarter that they find it harder to run together , as opposed to , independently .",I have and always will fly my rebel flag in my front yard for all to see.I also have it on my truck .,"( Includes : one pair of baggy pants , one pistol , a set of golden grills , a looting guide titled ' But I dendu nuffin ' , and one race card )",I have seen couple Chinese and Japanese tourists but I dont hate you .,Although I have English and Germans in my family as well .
