In [1]:
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import nltk
from nltk.corpus import stopwords
import sage
from collections import Counter
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# Clean Data

In [2]:
def is_ascii(s):
    """Report whether every character of ``s`` is a 7-bit ASCII character.

    Args:
        s: An iterable of single characters, typically a ``str``.

    Returns:
        ``True`` when no character has a code point of 128 or above (an
        empty input therefore yields ``True``), ``False`` otherwise.
    """
    for ch in s:
        # Stop at the first character outside the 0-127 range
        if ord(ch) >= 128:
            return False
    return True

def _clean_message(text):
    """Return the cleaned form of one message, or "" when it is unusable."""
    # Anything that is not a string (e.g. NaN for a missing value) cannot be
    # passed to re.sub; such entries are blanked instead of raising.
    if not isinstance(text, str):
        return ""

    # Parentheses and commas become spaces so neighbouring words stay apart
    filtered = re.sub(r'[\(, \)]', ' ', text)
    # Exclamation marks are dropped outright
    filtered = re.sub(r'[\!]', '', filtered)
    # Every remaining character that is neither a word character nor
    # whitespace is deleted.
    # NOTE(review): deleting (rather than spacing out) punctuation glues
    # hyphenated or slash-separated tokens together, e.g. "a-b" -> "ab";
    # confirm this is intended.
    filtered = re.sub(r'[^\w\s\!]', '', filtered)

    # Messages that still contain non-ASCII characters are discarded entirely
    return filtered if is_ascii(filtered) else ""


def cleanData(messages):
    """Strip punctuation from every message and blank out unusable ones.

    Each entry is processed independently:

    * ``(``, ``)`` and ``,`` are replaced by a space;
    * ``!`` and every other character that is not a word character or
      whitespace is removed;
    * entries that are not strings, or that still contain non-ASCII
      characters after filtering, are replaced by the empty string.

    Args:
        messages: A pandas Series of messages. It is modified IN PLACE, so
            pass a copy if the original values must be preserved.

    Returns:
        The same Series object that was passed in, now holding the cleaned
        strings.
    """
    # Only the expected failure (a non-string entry) is handled, so real
    # errors and KeyboardInterrupt are no longer silently swallowed.
    cleaned = [_clean_message(text) for text in messages]

    # One positional assignment keeps the in-place contract without paying
    # for a separate .iloc read and write on every row.
    messages.iloc[:] = cleaned
    return messages

# Word Importance

In [3]:
def wordImportance(messages, doc_index=0):
    """Rank the corpus vocabulary by its TF-IDF weight in one message.

    A TF-IDF model (English NLTK stop words removed) is fitted on all of
    ``messages``, but the scores returned are those of a SINGLE document,
    selected by ``doc_index``. Terms absent from that document get 0.

    Args:
        messages: Iterable of message strings used to fit the vectorizer.
        doc_index: Position of the message whose term weights are reported.
            Defaults to 0 (the first message).

    Returns:
        A DataFrame indexed by term with one column, ``"TF-IDF"``, sorted
        from highest to lowest weight.

    Note:
        Requires the NLTK ``stopwords`` corpus to be available locally.
    """
    # scikit-learn >= 1.2 validates stop_words and rejects a set; a list is
    # accepted by old and new releases alike.
    remove = sorted(set(stopwords.words('english')))
    tfIdfVectorizer = TfidfVectorizer(use_idf=True, stop_words=remove)
    tfIdf = tfIdfVectorizer.fit_transform(messages)

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on releases that predate get_feature_names_out() (added in 1.0).
    if hasattr(tfIdfVectorizer, "get_feature_names_out"):
        terms = tfIdfVectorizer.get_feature_names_out()
    else:
        terms = tfIdfVectorizer.get_feature_names()

    # Row of the chosen document, transposed to one score per vocabulary term
    scores = tfIdf[doc_index].T.toarray()
    df = pd.DataFrame(scores, index=terms, columns=["TF-IDF"])
    return df.sort_values('TF-IDF', ascending=False)

# Single Thread Analysis

In [4]:
# Read Ansar1Clean.csv, using its first column as the row index
data = pd.read_csv(filepath_or_buffer="Ansar1Clean.csv", index_col=0)

In [6]:
# Distinct thread IDs present in the data
threads = data["ThreadID"].unique()

# Keep only the rows of the first thread and renumber them from zero
thread = data.loc[data["ThreadID"] == threads[0]].reset_index(drop=True)

# Work on a copy of the message column: cleanData overwrites its argument
messages = thread.loc[:, "Message"].copy()

# Strip punctuation and blank out unusable messages
clean = cleanData(messages)

# TF-IDF term weights; display the 25 highest-scoring terms
importance = wordImportance(clean)
importance.head(n=25)

Unnamed: 0,TF-IDF
042209,0.357461
spencer,0.331783
marine,0.179923
chest,0.178731
iraqmaking,0.178731
honoluluadvertiser,0.178731
fifth,0.178731
death,0.166177
ray,0.165892
noncombatrelated,0.156782


# Full Data Set Analysis

In [None]:
# Copy of every message in the data set (cleanData overwrites its argument)
allMessages = data.loc[:, "Message"].copy()

# Strip punctuation and blank out unusable messages
clean = cleanData(allMessages)

# TF-IDF term weights; display the 25 highest-scoring terms
# NOTE(review): wordImportance fits on all messages but reports the weights
# of the first message only -- confirm that is the ranking wanted here.
importance = wordImportance(clean)
importance.head(n=25)