In [2]:
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import nltk
from nltk.corpus import stopwords
from collections import Counter
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# Clean Data

In [3]:
def is_ascii(s):
    """Return True when every character of ``s`` has a code point below 128.

    An empty ``s`` yields True, since there is no offending character.
    """
    for ch in s:
        if ord(ch) >= 128:
            # First non-ASCII character settles the answer.
            return False
    return True

def cleanData(messages):
    """Strip punctuation from every message and blank out unusable ones.

    Each element of ``messages`` is rewritten in place, by position:

    * parentheses, commas and spaces are replaced by a single space;
    * exclamation marks are removed, then every remaining character that
      is neither a word character nor whitespace is removed;
    * elements that are not strings (for example missing values) and
      messages that still contain a non-ASCII character become ``""``.

    Parameters
    ----------
    messages : pandas.Series
        Messages to clean (accessed through ``.iloc``). The object is
        modified in place, so pass a copy when the original values must
        be preserved.

    Returns
    -------
    pandas.Series
        The same ``messages`` object, after cleaning.
    """
    for i in range(len(messages)):
        raw = messages.iloc[i]

        if isinstance(raw, str):
            filtered = re.sub(r'[\(, \)]', ' ', raw)
            filtered = re.sub(r'[\!]', '', filtered)
            filtered = re.sub(r'[^\w\s\!]', '', filtered)
        else:
            # Non-string cells cannot be processed by re.sub; blank them
            # explicitly instead of hiding the failure behind a bare except.
            filtered = ""

        # \w is Unicode-aware, so non-ASCII letters survive the regexes;
        # such messages are discarded entirely.
        if not is_ascii(filtered):
            filtered = ""

        messages.iloc[i] = filtered

    return messages

# Word Importance

In [4]:
def wordImportance(messages):
    """Score the terms of the first message by TF-IDF.

    A ``TfidfVectorizer`` (English NLTK stop words removed) is fitted on
    all of ``messages``, but only the row for the first message
    (``tfIdf[0]``) is reported.

    Parameters
    ----------
    messages : iterable of str
        Documents to fit the vectorizer on.

    Returns
    -------
    pandas.DataFrame
        One row per vocabulary term (the index), with a single ``"TF-IDF"``
        column holding the first message's score, sorted in descending
        order. Terms absent from the first message have a score of 0.
    """
    # scikit-learn accepts stop words as a list (or 'english' / None);
    # newer releases reject a set during parameter validation.
    remove = list(stopwords.words('english'))
    tfIdfVectorizer = TfidfVectorizer(use_idf=True, stop_words=remove)
    tfIdf = tfIdfVectorizer.fit_transform(messages)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    if hasattr(tfIdfVectorizer, "get_feature_names_out"):
        terms = tfIdfVectorizer.get_feature_names_out()
    else:
        terms = tfIdfVectorizer.get_feature_names()

    df = pd.DataFrame(tfIdf[0].T.todense(), index=terms, columns=["TF-IDF"])
    df = df.sort_values('TF-IDF', ascending=False)
    return df

def wordCount(word, words):
    """Return how many entries of ``words`` are equal to ``word``.

    ``words`` may be any iterable; it is materialised as a list before
    counting.
    """
    as_list = list(words)
    occurrences = as_list.count(word)
    return occurrences

# Single Thread Analysis

In [5]:
# Read Ansar1Clean.csv; the first CSV column is used as the row index
data = pd.read_csv(filepath_or_buffer="Ansar1Clean.csv", index_col=0)

In [6]:
# Pick the second distinct ThreadID and keep only its rows, renumbered from 0
threads = data["ThreadID"].unique()
thread = data.loc[data["ThreadID"] == threads[1]].reset_index(drop=True)

# Work on a copy of the message column, since cleanData edits its argument in place
messages = thread["Message"].copy()
clean = cleanData(messages)

# TF-IDF scores; show only the terms with a non-zero score
importance = wordImportance(clean)
importance.loc[importance["TF-IDF"].ne(0)]

Unnamed: 0,TF-IDF
source,0.272208
httpwwwsendspacecomfilemydd4c,0.243838
language,0.243838
httprapidsharecomfiles22141498jihadpdfhtml,0.243838
httpwwwmediafirecomexjk5tcnnzt,0.243838
httpwwwmediafirecomnvwydgqu23k,0.243838
httpwwwmegauploadcomdkakolbzn,0.243838
httpwwwzsharenetdownload5865841133c35678,0.243838
httpwwwzsharenetdownload58658800d6f4a6c7,0.243838
httpwwwzsharenetdownload5865883315919906,0.243838


# Full Data Set Analysis

In [7]:
# Copy every message in the data set; cleanData edits its argument in place
allMessages = data["Message"].copy()
clean = cleanData(allMessages)

# Fit TF-IDF on all cleaned messages and show the 25 highest-scoring terms
importance = wordImportance(clean)
importance.head(n=25)

Unnamed: 0,TF-IDF
042209,0.325056
spencer,0.312864
marine,0.217447
combat,0.198801
death,0.17357
honoluluadvertiser,0.171121
iraqmaking,0.171121
ray,0.162528
noncombatrelated,0.156432
chest,0.151703


In [8]:
# Keep only the terms whose TF-IDF score is non-zero
importance.loc[importance["TF-IDF"].ne(0)]
    

Unnamed: 0,TF-IDF
042209,0.325056
spencer,0.312864
marine,0.217447
combat,0.198801
death,0.17357
honoluluadvertiser,0.171121
iraqmaking,0.171121
ray,0.162528
noncombatrelated,0.156432
chest,0.151703


# Lexicon Building

In [9]:
#Load Data
data = pd.read_csv("Ansar1Clean.csv", index_col=0)
#Get thread
threads = data["ThreadID"].unique()

# Collect, thread by thread, the terms that received a non-zero TF-IDF score
allWords = []
for thread_id in threads:

    thread = data[data["ThreadID"] == thread_id]
    thread = thread.reset_index(drop=True)

    #get all messages (copied, because cleanData edits its argument in place)
    messages = thread["Message"].copy()

    #clean data
    clean = cleanData(messages)

    #get word importance
    try:
        importance = wordImportance(clean)
    except ValueError:
        # The vectorizer could not be fitted on this thread (scikit-learn
        # raises ValueError e.g. when no usable term remains). Skip the
        # thread: falling through would reuse the previous thread's
        # `importance` and append its words a second time.
        continue

    toList = importance[importance["TF-IDF"] != 0].index.to_list()
    if toList:
        allWords.append(toList)

In [10]:
# Flatten the per-thread word lists into one array, then collect the distinct words
words = np.concatenate(allWords, axis=0)
uniqueWords = {entry for entry in words}


In [11]:
# Tally all words in a single pass, then read off the total for each unique
# word. `count` follows the iteration order of `uniqueWords`, as before, but
# the full word array is no longer copied and rescanned for every word.
word_totals = Counter(words)
count = [word_totals[word] for word in uniqueWords]
    