In [1]:
import re
import pandas as pd
from collections import defaultdict
import tldextract
from langdetect import detect
from langdetect import DetectorFactory
DetectorFactory.seed = 0 # To ensure reproducible language detection results

Related to [T63](https://mattiasostmar.phacility.com/T63) in Phabricator 

The raw survey-file is first copied to data/interim/<name>_cleaned.txt.

That file is thereafter manually altered:

- Manually removed first 123 lines, where text was still not stored by script
- Manually removed 69 occurances of empty lines

This leaves 28 311 lines in survey_2018-01-23_cleaned.txt

In [2]:
# Replace unwanted semicolons
def replace_superflous_semicolons(line, line_no):
    
    # remove semicolons
    semicolons = 0
    new_line = ""
    for char in line:
        if char == ";":
            semicolons += 1
            
        if char == ";" and semicolons > 11: # the number of expected columns
            new_line += "<semic>"
        else:
            new_line += char
    
    if len(new_line.split(";")) < 11:
        return new_line.replace(";","<semic>")
    else:
        return new_line

def is_orphan_line(line):
    return not line.startswith("http")

def remove_newlines(line):
    return re.sub(r"[\n\r\n]"," ",line)


In [3]:
f_in = open("../../data/interim/survey_2018-01-23_to_clean.txt")
f_out = open("../../data/interim/survey_2018-01-23_cleaned.txt","w")
line_no = 0

data = {}

# ['url', 'typealyzer', 'actual', 'e', 's', 't', 'sntf_s', 'sntf_n', 'sntf_t', 'sntf_f', 'date', 'text']
cols = [col.rstrip() for col in f_in.readline().split(";")]
last_good_line = 1 # Not 0, since it's the columns line

for line in f_in:
    line_no += 1
   
    new_line = replace_superflous_semicolons(line, line_no)
    new_line = remove_newlines(new_line)
    
    if line_no > 0 and not is_orphan_line(new_line):
        last_good_line = line_no
        line_data = new_line.split(";")
        if not len(line_data) == 12:
            print("Oops, the line data has len() {}".format(len(line_data)))
            print("line_no: {}".format(line_no))
            print("new_line: {}".format(new_line))
            print("line_data: {}".format("\nline:".join(line_data)))
            break
        data[line_no] = defaultdict(dict)
        for col in cols:
            data[line_no][col] = None
            
        for (col,coldata) in zip(cols, new_line.split(";")):
            data[line_no][col] = coldata
        
    if is_orphan_line(new_line):
        #print("Suspected orphan line:\n{}".format(new_line))
        #print("Last good lines fields:\n{}".format(data[last_good_line]))
        new_line = new_line.strip("\n")
        padded_new_line = " " + new_line
        data[last_good_line]["text"] += padded_new_line.strip("\n")
        #print("added line {}'s text to line {}".format(line_no, last_good_line))

print("len(data): {}".format(len(data)))
#print("data[1]:\n{}".format(data[1]))
for row_nr in data:
    row_string = ""
    for key in cols:
        separated_data = data[row_nr][key] + ";"
        row_string += separated_data
    row_string = row_string.strip(";")
    if row_string.endswith("\n"):
        f_out.write(row_string)
    else:
        f_out.write(row_string + "\n")
print("No of lines in orginal raw data: {}".format(line_no))
f_out.close()
f_in.close()

len(data): 27959
No of lines in orginal raw data: 28310


After cleaning rows containing semicolons etc we have 27 959 rows left.

## Load the above cleaned survey-file to Pandas

In [2]:
f = open("../../data/interim/survey_2018-01-23_cleaned.txt")
names = ['url', 'typealyzer', 'actual', 'e', 's', 't', 'sntf_s', 'sntf_n', 'sntf_t', 'sntf_f', 'date', 'text']
df = pd.read_csv(f, sep=";", names=names)
len(df)

27959

In [5]:
df.head(3)

Unnamed: 0,url,typealyzer,actual,e,s,t,sntf_s,sntf_n,sntf_t,sntf_f,date,text
0,http://jonkagstrom.com,ISTP,INFJ,0.420758,0.651605,0.652214,0.512359,0.274234,0.134025,0.079382,20120828 09:08:55,Jon Kågström playchilla.com uclassify.com abou...
1,http://adropofcolour.tumblr.com,ISFP,INFJ,0.291281,0.787844,0.460961,0.663515,0.178565,0.069282,0.088638,20120828 08:08:11,❀*a drop of colour*❀ 1/39 next→ home ask past ...
2,http://godheadcomplex.tumblr.com,ESFP,INFP,0.883579,0.951693,0.238407,0.855921,0.046931,0.02185,0.075297,20120828 09:08:34,Neko cool kids can't die home family daveblog ...


In [3]:
df.date.describe()

count                 27959
unique                16318
top       20140801 07:08:35
freq                     25
Name: date, dtype: object

# Add a column with crude count of tokens in the text column

In [4]:
tkn_s = df.text.str.split()

tkn_lens = []
for ix, l in tkn_s.iteritems():
    tkn_lens.append(len(l))
    
df["tokens"] = pd.Series(tkn_lens)

In [7]:
df[pd.isnull(df["tokens"])]

Unnamed: 0,url,typealyzer,actual,e,s,t,sntf_s,sntf_n,sntf_t,sntf_f,date,text,tokens,domain


# Extract domains using tldextract
See [tldextract](https://pypi.python.org/pypi/tldextract)

Note:
co.vu is a free domain name service

In [8]:
domains = []
for index, row in df.iterrows():
    ext = tldextract.extract(row.url)
    domains.append(ext.domain)
df["domain"] = pd.Series(domains)
df.domain.value_counts()

tumblr                    24487
blogspot                    590
wordpress                   545
co                          285
twitter                     248
facebook                    170
google                       63
livejournal                  53
Tumblr                       29
weebly                       28
reddit                       27
okcupid                      21
fanfiction                   20
TUMBLR                       20
tumbr                        16
intjforum                    16
Twitter                      15
personalitycafe              15
deviantart                   15
youtube                      14
personalityjunkie            13
dreamwidth                   13
medium                       13
typealyzer                   13
instagram                    12
ovh                          12
pointlesssites               12
pinterest                    12
fighunter                    11
pastebin                     10
                          ...  
susiemak

# Remove rows with un-translateable texts
187 rows canno't be language classified using [langdetect](https://github.com/Mimino666/langdetect), mostly due to non-existent texts. 

Others contains mostly symbols, e.g. "▲▼▲▼▲▼ ▲▼▲▼▲▼" or "----''. _ __ / .' /"

In [10]:
error_ix = []
langs = []
for ix, row in df.iterrows():
    try:
        langs.append(detect(row["text"]))
    except Exception as e:
        print("ix: {} error: {}\ntext{}".format(ix, e, row["text"]))
        error_ix.append(ix)
print("No of un-transtanslateable rows: {}".format(len(error_ix)))

ix: 414 error: No features in text.
text 
ix: 424 error: No features in text.
text/ ._> | | | '_>/ ._>| . \/ ._>| || |/ ._> | _/\___/`___||_| \___. |_| |_| \___.|___/\___.|_||_|\___. |_| -->  
ix: 825 error: No features in text.
text 
ix: 844 error: No features in text.
text 
ix: 1378 error: No features in text.
text 
ix: 1664 error: No features in text.
text 
ix: 2005 error: No features in text.
text 
ix: 2110 error: No features in text.
text 
ix: 2272 error: No features in text.
text 
ix: 2284 error: No features in text.
text 
ix: 2805 error: No features in text.
text 
ix: 2850 error: No features in text.
text 
ix: 2851 error: No features in text.
text 
ix: 2970 error: No features in text.
text 
ix: 2990 error: No features in text.
text 
ix: 3211 error: No features in text.
text= 
ix: 3309 error: No features in text.
text 
ix: 3743 error: No features in text.
text 
ix: 3825 error: No features in text.
text 
ix: 3843 error: No features in text.
text 
ix: 3886 error: No features in tex

ix: 24813 error: No features in text.
text 
ix: 24843 error: No features in text.
text 
ix: 24861 error: No features in text.
text 
ix: 25524 error: No features in text.
text 
ix: 25535 error: No features in text.
text 
ix: 26024 error: No features in text.
text 
ix: 26052 error: No features in text.
text 
ix: 26072 error: No features in text.
text 
ix: 26154 error: No features in text.
text 
ix: 26155 error: No features in text.
text 
ix: 26194 error: No features in text.
text 
ix: 26398 error: No features in text.
text 
ix: 26754 error: No features in text.
text 
ix: 27647 error: No features in text.
text 
ix: 27908 error: No features in text.
text 
No of un-transtanslateable rows: 187


In [29]:
len(error_ix)

187

In [22]:
df = df.drop(error_ix)
df.reset_index(drop=True, inplace=True)

We now have 27 772 rows left from the original raw data.

In [28]:
df.to_pickle("../../pickles/dataframe_survey_2018-01-23_cleaned.pickle")

In [25]:
df.to_csv("../../data/interim/dataframe_survey_2018-01-23_cleaned.csv", sep=";")