## Imports

In [1]:
import os
from operator import add
from collections import Counter
import io
import numpy as np
import pandas as pd
import regex
import json

from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import countDistinct, col, udf, stddev, avg


## Configs

In [11]:
PRODIGY_DATASET_PATH = 'data/prodigy/descriptions.jsonl'
TWITTER_PROFILE_PATHS = 'data/raw/*_user_info.txt'
IDENTITY_DICTIONARIES_PATH = 'data/identities/'
PRODIGY_PATTERN_FILE_PATH = 'data/prodigy/patterns.jsonl'

MIN_DESCRIPTION_LEN = 10
SAMPLE_SIZE = 1000

## Initialization

In [6]:
schema = StructType([
    StructField("uid",StringType(),True),
    StructField('name',StringType(),True),
    StructField("screen_name",StringType(),True),
    StructField('url',StringType(),True),
    StructField('protected',StringType(),True),
    StructField('location',StringType(),True),
    StructField('description',StringType(),True),
    StructField("followers_count",IntegerType(),True),
    StructField("friends_count",IntegerType(),True),
    StructField("created_at",StringType(),True),
    StructField("utc_offset",StringType(),True),
    StructField('time_zone',StringType(),True),
    StructField("statuses_count",IntegerType(),True),
    StructField("lang",StringType(),True),
    StructField("status_created_at",StringType(),True),
    StructField('status_coordinates',StringType(),True),
    StructField("status_lang",StringType(),True),
    StructField("profile_image_url_https",StringType(),True),
    StructField("verified",StringType(),True)
])

spark = SparkSession.builder.appName("spark-app").config("PYSPARK_PYTHON","python").getOrCreate()
profiles = spark.read.csv(TWITTER_PROFILE_PATHS, header=False, sep="\t", schema=schema)

profiles.show(3)

+------------------+----------------+-----------+----+---------+------------+--------------------+---------------+-------------+--------------------+----------+---------+--------------+----+--------------------+------------------+-----------+-----------------------+--------+
|               uid|            name|screen_name| url|protected|    location|         description|followers_count|friends_count|          created_at|utc_offset|time_zone|statuses_count|lang|   status_created_at|status_coordinates|status_lang|profile_image_url_https|verified|
+------------------+----------------+-----------+----+---------+------------+--------------------+---------------+-------------+--------------------+----------+---------+--------------+----+--------------------+------------------+-----------+-----------------------+--------+
|        2590887015|        Jennifer|    jp62783|None|    False|        null|                null|              5|           12|Sun Jun 08 01:41:...|      None|     None|  

## Filtering english and regex matches

In [7]:
def regex_filters(df, regex_list):
    for i, regex in enumerate(regex_list):
        df = df.filter(profiles.description.rlike(regex))
    return df

print("initial df size:", profiles.count())

eng_filtered = profiles.filter(
    (profiles.lang.startswith("en")) &
    (profiles.status_lang.startswith("en"))
)

print("after filtering english profiles:", eng_filtered.count())


initial df size: 1394484
after filtering english profiles: 408237


In [8]:
eng_filtered.filter(profiles.description.isNotNull()).count()

237482

In [10]:
%%time

regex_list = [
#     "((\w\s*)+[,|#/]\s*)",
#     "(i am a )|(I am a )|(I'm a )|(i'm a )|(i am an )|(I am an )|(I'm an )|(i'm an )",
]

regex_filtered = regex_filters(eng_filtered, regex_list)
regex_filtered = regex_filtered.filter(F.length(F.col('description')) > MIN_DESCRIPTION_LEN) 

print("after applying regex size:", regex_filtered.count())

regex_filtered.select('description').head(5)

after applying regex size: 219610
CPU times: user 7.9 ms, sys: 5.51 ms, total: 13.4 ms
Wall time: 1.44 s


[Row(description='teaching 21st century skills'),
 Row(description='God is able'),
 Row(description="I talk about my nipples a lot. I'm wildly attracted to JWoww. I use too much toilet paper. I love Lady Gaga. I lay on couches."),
 Row(description='follow me on instagram @cloudydeer'),
 Row(description="•Israeli Born & Raised•♋️ •Cali Livin'• ♛ •GoodVibes•")]

## Sample data

In [14]:
frac = SAMPLE_SIZE / regex_filtered.count() 
sampled_df = regex_filtered.sample(frac).select('description')
sampled_df.head(5)

[Row(description='mother and grandmother  enjoying life being together with family'),
 Row(description='K r i s t a N a w r o c k i*  #mother to a beautiful daughter named Briella'),
 Row(description='Central Arkansas Delta Zeta'),
 Row(description='god is my life ♏️ follow ig @nicole_goddess   snapchat dquazha smith'),
 Row(description='Insta TrippyJerm306')]

## Saving descriptions in prodigy format

In [15]:

descs = list(map(
    lambda x: x.description,
    sampled_df.collect()
))

with open(PRODIGY_DATASET_PATH, 'w') as f:
    for desc in descs:
        f.write(json.dumps({'text': desc}) + '\n')
        

## Extracting prodigy patterns from dictionaries

In [8]:

ignored_ids = ['person', 'god', 'a', 'fan']

def extract_prodigy_patterns(identities_dir):
    
    def build_pattern(words, label='IDENTITY'):
        return {
            'label': label,
            'pattern': [{'lower': _id_part.lower()} for _id_part in words.split(' ')]
        }
            
    patterns = []   
    for id_rel_path in os.listdir(identities_dir):
        
        id_path = os.path.join(identities_dir, id_rel_path)
        print('reading file:', id_path)
        
        with open(id_path, 'r') as f:
            
            ids = f.read().split('\n')
            for _id in ids:
                
                if len(_id) == 0 or _id in ignored_ids:
                    continue
                
                pattern = build_pattern(_id)
                patterns.append(json.dumps(pattern))
    return patterns
        
patterns = extract_prodigy_patterns(IDENTITY_DICTIONARIES_PATH)

with open(PRODIGY_PATTERN_FILE_PATH, 'w') as f:
    f.write('\n'.join(patterns))

reading file: data/identities/job_identities.txt
reading file: data/identities/national_identities.txt
reading file: data/identities/uga_identities.txt
reading file: data/identities/identities.txt
reading file: data/identities/racial_slur_identities.txt
reading file: data/identities/wordnet_identities.txt
reading file: data/identities/twitter_identities.txt
