In [159]:
import re
import pandas as pd
from collections import defaultdict
import tldextract

Related to [T63](https://mattiasostmar.phacility.com/T63) in Phabricator 

The raw survey-file is first copied to data/interim/<name>_cleaned.txt.

That file is thereafter manually altered:

- Manually removed first 123 lines, where text was still not stored by script
- Manually removed 69 occurances of empty lines

This leaves 28 311 lines in survey_2018-01-23_cleaned.txt

In [152]:
# Replace unwanted semicolons
def replace_superflous_semicolons(line, line_no):
    
    # remove semicolons
    semicolons = 0
    new_line = ""
    for char in line:
        if char == ";":
            semicolons += 1
            
        if char == ";" and semicolons > 11: # the number of expected columns
            new_line += "<semic>"
        else:
            new_line += char
    
    if len(new_line.split(";")) < 11:
        return new_line.replace(";","<semic>")
    else:
        return new_line

def is_orphan_line(line):
    return not line.startswith("http")

def remove_newlines(line):
    return re.sub(r"[\n\r\n]"," ",line)


In [153]:
f_in = open("../data/interim/survey_2018-01-23_to_clean.txt")
f_out = open("../data/processed/survey_2018-01-23_cleaned.txt","w")
line_no = 0

data = {}

# ['url', 'typealyzer', 'actual', 'e', 's', 't', 'sntf_s', 'sntf_n', 'sntf_t', 'sntf_f', 'date', 'text']
cols = [col.rstrip() for col in f_in.readline().split(";")]
last_good_line = 1 # Not 0, since it's the columns line

for line in f_in:
    line_no += 1
   
    new_line = replace_superflous_semicolons(line, line_no)
    new_line = remove_newlines(new_line)
    
    if line_no > 0 and not is_orphan_line(new_line):
        last_good_line = line_no
        line_data = new_line.split(";")
        if not len(line_data) == 12:
            print("Oops, the line data has len() {}".format(len(line_data)))
            print("line_no: {}".format(line_no))
            print("new_line: {}".format(new_line))
            print("line_data: {}".format("\nline:".join(line_data)))
            break
        data[line_no] = defaultdict(dict)
        for col in cols:
            data[line_no][col] = None
            
        for (col,coldata) in zip(cols,new_line.split(";")):
            data[line_no][col] = coldata
        
    if is_orphan_line(new_line):
        #print("Suspected orphan line:\n{}".format(new_line))
        #print("Last good lines fields:\n{}".format(data[last_good_line]))
        new_line = new_line.strip("\n")
        padded_new_line = " " + new_line
        data[last_good_line]["text"] += padded_new_line.strip("\n")
        print("added line {}'s text to line {}".format(line_no, last_good_line))

columns_line = open("../data/interim/survey_2018-01-23_cleaned.txt").readline()
f_out.write(columns_line)
print("len(data): {}".format(len(data)))
#print("data[1]:\n{}".format(data[1]))
for row_nr in data:
    row_string = ""
    for key in cols:
        separated_data = data[row_nr][key] + ";"
        row_string += separated_data
    row_string = row_string.strip(";")
    if row_string.endswith("\n"):
        f_out.write(row_string)
    else:
        f_out.write(row_string + "\n")
f_out.close()
f_in.close()

added line 878's text to line 877
added line 879's text to line 877
added line 992's text to line 991
added line 993's text to line 991
added line 1077's text to line 1076
added line 1434's text to line 1433
added line 1620's text to line 1619
added line 1621's text to line 1619
added line 1634's text to line 1633
added line 1635's text to line 1633
added line 1636's text to line 1633
added line 1637's text to line 1633
added line 1638's text to line 1633
added line 1639's text to line 1633
added line 1640's text to line 1633
added line 1641's text to line 1633
added line 1642's text to line 1633
added line 1643's text to line 1633
added line 1644's text to line 1633
added line 1645's text to line 1633
added line 1646's text to line 1633
added line 1647's text to line 1633
added line 1648's text to line 1633
added line 1649's text to line 1633
added line 1998's text to line 1997
added line 1999's text to line 1997
added line 2001's text to line 2000
added line 2002's text to line 2000


## Load final cleaned survey-file to Pandas

In [154]:
f = open("../data/processed/survey_2018-01-23_final.txt")
df = pd.read_csv(f, sep=";")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27959 entries, 0 to 27958
Data columns (total 12 columns):
url           27959 non-null object
typealyzer    27959 non-null object
actual        27959 non-null object
e             27959 non-null float64
s             27959 non-null float64
t             27959 non-null float64
sntf_s        27959 non-null float64
sntf_n        27959 non-null float64
sntf_t        27959 non-null float64
sntf_f        27959 non-null float64
date          27959 non-null object
text          27959 non-null object
dtypes: float64(7), object(5)
memory usage: 2.6+ MB


In [157]:
df.head(3)

Unnamed: 0,url,typealyzer,actual,e,s,t,sntf_s,sntf_n,sntf_t,sntf_f,date,text
0,http://jonkagstrom.com,ISTP,INFJ,0.420758,0.651605,0.652214,0.512359,0.274234,0.134025,0.079382,20120828 09:08:55,Jon Kågström playchilla.com uclassify.com abou...
1,http://adropofcolour.tumblr.com,ISFP,INFJ,0.291281,0.787844,0.460961,0.663515,0.178565,0.069282,0.088638,20120828 08:08:11,❀*a drop of colour*❀ 1/39 next→ home ask past ...
2,http://godheadcomplex.tumblr.com,ESFP,INFP,0.883579,0.951693,0.238407,0.855921,0.046931,0.02185,0.075297,20120828 09:08:34,Neko cool kids can't die home family daveblog ...


# Extract domains using tldextract
See [tldextract](https://pypi.python.org/pypi/tldextract)

Note:
co.vu is a free domain name service

In [168]:
domains = []
for index, row in df.iterrows():
    ext = tldextract.extract(row.url)
    domains.append(ext.domain)
df["domain"] = pd.Series(domains)
df.domain.value_counts()

tumblr                24487
blogspot                590
wordpress               545
co                      285
twitter                 248
facebook                170
google                   63
livejournal              53
Tumblr                   29
weebly                   28
reddit                   27
okcupid                  21
TUMBLR                   20
fanfiction               20
intjforum                16
tumbr                    16
Twitter                  15
deviantart               15
personalitycafe          15
youtube                  14
typealyzer               13
dreamwidth               13
personalityjunkie        13
medium                   13
pointlesssites           12
pinterest                12
instagram                12
ovh                      12
fighunter                11
pastebin                 10
                      ...  
yunglean                  1
tvmoviechristmas          1
ducklings                 1
genealogy                 1
luteces             

In [170]:
df.to_pickle("../pickles/dataframe_survey_2018-01-23_final.pickle")

In [171]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27959 entries, 0 to 27958
Data columns (total 14 columns):
url           27959 non-null object
typealyzer    27959 non-null object
actual        27959 non-null object
e             27959 non-null float64
s             27959 non-null float64
t             27959 non-null float64
sntf_s        27959 non-null float64
sntf_n        27959 non-null float64
sntf_t        27959 non-null float64
sntf_f        27959 non-null float64
date          27959 non-null object
text          27959 non-null object
domains       27959 non-null object
domain        27959 non-null object
dtypes: float64(7), object(7)
memory usage: 3.0+ MB
