In [None]:
import re
import string
import pandas as pd

In [None]:
demojis = pd.read_csv('./data/deutsche-bahn/demojis.tsv', header=None, sep='\t')
demojis = demojis.set_index(0).to_dict()[1]

In [21]:
df = pd.read_csv('./data/deutsche-bahn/train_v1.4.tsv', header=None, sep='\t')
df = df[pd.notnull(df[1])]
df.head()

Unnamed: 0,0,1
0,neutral,"@DB_Bahn ja, weil in Wuppertal Bauarbeiten sin..."
1,positive,@nordschaf theoretisch kannste dir überall im ...
2,negative,Bahn verspätet sich..gleich kommt noch jemand ...
3,neutral,Ihre Anfragen brachten uns zu neuen Leistungen...
4,neutral,Kann ich mit dem DB Geschenk Ticket den ICE Sp...


In [22]:
lookup = {'neutral': 2,
          'positive': 1,
          'negative': 0}

sentiments = df[0].tolist()
sentiments = [lookup[sentiment] for sentiment in sentiments]
df[0] = sentiments

In [23]:
lines = df[1].tolist()

In [24]:
# clean dirty
_re1 = re.compile(r'[¼✔•●�©®☮\|\—\–^><➱←↑→↓►▶️ॐ~]|(m⊃2)', re.UNICODE)
for idx, line in enumerate(lines):
    try:
        lines[idx] = _re1.sub('', line)
    except:
        lines[idx]  = ''

In [25]:
# standardaise quotations
_re1 = re.compile(r'“|„', re.UNICODE)
_re2 = re.compile(r'”', re.UNICODE)
_re3 = re.compile(r'‚/w+‘', re.UNICODE)

for idx, line in enumerate(lines):
    line = _re1.sub(' « ', line)
    line = _re2.sub(' » ', line)
    search = _re3.findall(line)
    if search:
        std = ' '.join([' ‹', search[0][1:-1], '› '])
        lines[idx] = _re3.sub(std, line)
    else:
        lines[idx] = line

In [26]:
# lower-case all upper-case words

for idx, line in enumerate(lines):
    words = line.split(' ')
    lines[idx] = ' '.join([word.lower() if word.isupper() \
                            else word for word in words])

In [27]:
for idx, line in enumerate(lines):
    line = re.sub(r'\?+', '?', line, re.UNICODE)
    line = re.sub(r'\!+', '!', line, re.UNICODE)
    line = re.sub(r'\,+', ',', line, re.UNICODE)
    line = re.sub(r'\*+', '*', line, re.UNICODE)
    line = re.sub(r'\.[\.]+', ' … ', line, re.UNICODE)
    lines[idx] = re.sub(r'\-+', '-', line, re.UNICODE)

In [28]:
# drop hyphes and asterisks not between characters
_re1 = re.compile(r'(?<!\w)\*+(?!\w)|(?<!\w)\-+(?!\w)', re.UNICODE)
for idx, line in enumerate(lines):
    lines[idx] = _re1.sub('', line)

In [29]:
_re1 = re.compile(r'\&', re.UNICODE)
for idx, line in enumerate(lines):
    lines[idx] = _re1.sub(' and ', line)

In [30]:
for idx, line in enumerate(lines):
    line = re.sub(r'\-[\-]+', ' ', line, re.UNICODE)
    line = re.sub(r'\+[\+]+', ' ', line, re.UNICODE)
    line = re.sub(r'\_[\_]+', ' ', line, re.UNICODE)
    line = re.sub(r'\=[\=]+', ' ', line, re.UNICODE)
    lines[idx] = re.sub(r'\…[\…]+', ' ', line, re.UNICODE)

In [31]:
_re1 = re.compile(r'https?:\/\/(?:ww[w12]\.|(?!ww[w12]))(.*?)(?=\s|$)', re.UNICODE)
for idx, line in enumerate(lines):
    lines[idx] = _re1.sub(' <url> ', line)

In [32]:
_re1 = re.compile(r'@DB_Bahn|@db_bahn|@DB_bahn', re.UNICODE)
for idx, line in enumerate(lines):
    lines[idx] = _re1.sub(' <dbahn> ', line)

_re2 = re.compile(r'@[A-Za-z0-9_-]+(?=\s|\:|\.|\)|$)', re.UNICODE)
for idx, line in enumerate(lines):
    lines[idx] = _re2.sub(' @mention ', line)
    
_re3 = re.compile(r'[\w_\.]+@[\w_-]*.[\w]+', re.UNICODE)
for idx, line in enumerate(lines):
    lines[idx] = _re2.sub(' <email> ', line)

In [33]:
demoticons = {
':)': 'leicht lächelndes Gesicht' ,
':(': 'betrübtes Gesicht',
';)': 'zwinkerndes Gesicht',
':/': 'verwundertes Gesicht',
':D': 'grinsendes Gesicht mit großen Augen',
':P': 'sich die Lippen leckendes Gesicht',
';D': ' grinsendes Gesicht mit lachenden Augen',
'XD': 'grinsendes Gesicht mit zusammengekniffenen Augen',
'( ͡° ͜ʖ ͡°)': 'selbstgefällig grinsendes Gesicht',
'❤' : 'rotes Herz',
'✌': 'Victory-Geste'
}

demojis = {**demojis, **demoticons}

In [34]:
for idx, line in enumerate(lines):
    line = re.sub(r'\:\-?\)', f"<e> {demojis[':)']} </e>", line, re.UNICODE)
    line = re.sub(r'\:\-?\(', f"<e> {demojis[':(']} </e>", line, re.UNICODE)
    line = re.sub(r'\;\-?[\\\/]', f"<e> {demojis[':/']} </e>", line, re.UNICODE)
    line = re.sub(r'\;\-?\)', f"<e> {demojis[';)']} </e>", line, re.UNICODE)
    line = re.sub(r'\:\-?D', f"<e> {demojis[':D']} </e>", line, re.UNICODE)
    line = re.sub(r'\:\-?[Pp]', f"<e> {demojis[':P']} </e>", line, re.UNICODE)
    line = re.sub(r'\;\-?D', f"<e> {demojis[';D']} </e>", line, re.UNICODE)
    line = re.sub(r'[xX]D', f"<e> {demojis['XD']} </e>", line, re.UNICODE)
    line = re.sub(r'( ͡° ͜ʖ ͡°)', f"<e> {demojis['( ͡° ͜ʖ ͡°)']} </e>", line, re.UNICODE)
    lines[idx] = line

In [35]:
for idx, line in enumerate(lines):
    line = re.sub(r'<url>\s*<url>+', ' <url> ', line, re.UNICODE)
    line = re.sub(r'@mention\s*@mention+', ' @mention ', line, re.UNICODE)
    line = re.sub(r'<email>\s*<email>', ' <email> ', line, re.UNICODE)
    lines[idx] = re.sub(r'\s\s+', ' ', line, re.UNICODE)

In [36]:
for idx, line in enumerate(lines):
    lines[idx] = ' '.join([f'<e> {demojis[word]} </e>' if word in list(demojis.keys()) else word for word in line.split()])

In [37]:
df[1] = lines
df.head()

Unnamed: 0,0,1
0,2,"<dbahn> ja, weil in Wuppertal Bauarbeiten sind..."
1,1,<email> theoretisch kannste dir überall im Köl...
2,0,Bahn verspätet sich … gleich kommt noch jemand...
3,2,Ihre Anfragen brachten uns zu neuen Leistungen...
4,2,Kann ich mit dem db Geschenk Ticket den ice Sp...


In [38]:
df.to_csv('train.csv', header=False, index=False)