# Find identity term matches in data

In [19]:
# Load incels data
import pandas as pd
import csv

path = '../../data/incels/all_comments.csv'
# data = pd.read_csv(path, engine='python', on_bad_lines=lambda x: print(x))
data = pd.read_csv(path, engine='python', on_bad_lines=lambda row: row[:-2].append(' '.join(row[-2:]))) # combine last 2 elements in a line mentioning Gulag
data['parsed_date'] = pd.to_datetime(data.date, errors='coerce')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6248230 entries, 0 to 6248229
Data columns (total 7 columns):
 #   Column       Dtype         
---  ------       -----         
 0   type         object        
 1   forum        object        
 2   thread       object        
 3   username     object        
 4   date         object        
 5   content      object        
 6   parsed_date  datetime64[ns]
dtypes: datetime64[ns](1), object(6)
memory usage: 333.7+ MB


## NetMapper identity term list

In [14]:
# Load identity terms
import pandas as pd

nm_identities_path = '../resources/generic_agents-identity_v15_2021_10_15.xlsx'
multi_identities = pd.read_excel(nm_identities_path)
multi_identities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19887 entries, 0 to 19886
Columns: 156 entries, Akan to Category 4
dtypes: float64(56), int64(1), object(99)
memory usage: 23.7+ MB


In [25]:
# Filter to English, remove duplicates
cols = multi_identities.columns.tolist()
identities = multi_identities[cols[cols.index('English'):]]
identities['term'] = identities['English'].str.lower()
identities.drop_duplicates(subset='term', inplace=True)
identities.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19035 entries, 0 to 19886
Data columns (total 44 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   English          19035 non-null  object 
 1   conceptTo        19035 non-null  object 
 2   metaOntology     19035 non-null  object 
 3   nodeType         0 non-null      float64
 4   Category 1       9703 non-null   object 
 5   Category 2       911 non-null    object 
 6   Category 3       18929 non-null  object 
 7   Country          4712 non-null   object 
 8   First Name       0 non-null      float64
 9   Last Name        0 non-null      float64
 10  Gender           759 non-null    object 
 11  Suffix           0 non-null      float64
 12  Language         0 non-null      float64
 13  Acronym          336 non-null    object 
 14  Valence          19035 non-null  object 
 15  Evaluation       19035 non-null  int64  
 16  Potency          10 non-null     float64
 17  Activity    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  identities['term'] = identities['English'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  identities.drop_duplicates(subset='term', inplace=True)


In [None]:
# Separate out stopwords
stops = ident
identities[identities['stop word']==1]['term'].tolist()

In [26]:
# Search for identity matches
import re

pats = [re.compile(r'\b{}\b'.format(re.escape(term.lower()))) for term in identities['English']]
len(pats)

19035

In [32]:
def contains_match(text):
    """ Return matched patterns present in text """ 
    return [re.search(p, str(text).lower()).group() for p in pats if re.search(p, str(text).lower()) is not None]

In [28]:
contains_match('i am british')

['i', 'british']

In [33]:
head = data.head(100)
head['nm_identity_matches'] = head.content.map(contains_match)
head[head.nm_identity_matches.str.len() > 0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  head['nm_identity_matches'] = head.content.map(contains_match)


Unnamed: 0,type,forum,thread,username,date,content,parsed_date,nm_identity_matches
0,COMMENT,001-MustReadContent,0000014-Itssosadthatwereplacesoc,Transcended Trucel,"Nov 20, 2020","Indeed its very sad thing, but no choice for us.",2020-11-20,"[its, us]"
1,COMMENT,001-MustReadContent,0000014-Itssosadthatwereplacesoc,ItsNotADream,"Nov 20, 2020",to be fair all normies nowadays are on their i...,2020-11-20,[their]
2,COMMENT,001-MustReadContent,0000014-Itssosadthatwereplacesoc,AAAAAAAAAAAcel,"Nov 20, 2020","What would you prefer, staring at the wall all...",2020-11-20,"[you, nobody]"
3,COMMENT,001-MustReadContent,0000014-Itssosadthatwereplacesoc,Amerihiki,"Nov 20, 2020","I know how you feel; I lost interest in vidya,...",2020-11-20,"[i, you, other]"
4,COMMENT,001-MustReadContent,0000014-Itssosadthatwereplacesoc,Deleted member 7448,"Nov 20, 2020",ItsNotADream said: to be fair all normies nowa...,2020-11-20,"[i, i'm, it, it's, their, they, they're, other]"
...,...,...,...,...,...,...,...,...
94,COMMENT,001-MustReadContent,0000005-Baldingissoterrifyingtha,Irredeemable,"Sep 25, 2020",FullTimeLoser said: Balding can take a giga ch...,2020-09-25,"[it, it's]"
96,COMMENT,001-MustReadContent,0000005-Baldingissoterrifyingtha,Gymcelled,"Sep 25, 2020",FullTimeLoser said: Balding can take a giga ch...,2020-09-25,"[i, me, it, it's, you, him]"
97,COMMENT,001-MustReadContent,0000005-Baldingissoterrifyingtha,mänline,"Sep 25, 2020",Jude Law looked better with balding IMO. But m...,2020-09-25,[his]
98,COMMENT,001-MustReadContent,0000005-Baldingissoterrifyingtha,Gymcelled,"Sep 25, 2020",ReturnOfSaddam said: Legendary thread. Hair tr...,2020-09-25,"[i, it, it's, they, you, your, you're, male, d..."
