In [4]:
from xml.dom import minidom
from BeautifulSoup import BeautifulSoup
from nltk.tokenize import TweetTokenizer
from os import walk
import pyprind
import re
import pandas as pd
import pickle

** 1. Učitavanje svih XML-a i spremanje u mapu sa parovima [user_id::int, tweets::list(String))] **

In [5]:
mapa = {}
mypath = 'input'


for (dirpath, dirnames, filenames) in walk(mypath):
    pbar = pyprind.ProgBar(len(filenames))
    for f in filenames:
        fsp = f.split(".")
        if fsp[-1] == 'xml':
            xmldoc = minidom.parse(mypath + "/" + f)
            tweets = []
            for doc in xmldoc.getElementsByTagName('document'):
                try:
                    tweets.append(doc.firstChild.wholeText)
                    br_tweets += 1
                except:
                    pass
            mapa[fsp[0]] = tweets
        pbar.update()

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:15


** 2. Provjera mape **

In [6]:
df = pd.DataFrame()
no_of_empty = 0

for key in mapa:
    if len(mapa[key]) == 0:
        no_of_empty += 1
    else:
        df = df.append([[key, mapa[key][:3]]], ignore_index=True)

df.columns = ['user_id', 'tweets']

print "Broj praznih entry-a!! : ", no_of_empty, "/", len(mapa)
df.head(10)

Broj praznih entry-a!! :  15 / 436


Unnamed: 0,user_id,tweets
0,f677a164b16096fc2eb3af86694da8ac,[Flashback: WSJ (7/28/11): The Road to a Downg...
1,15bfa35315fb53d6d2116e103e744db2,[Veient el partit del Barça en un hotel a Manc...
2,7eab4a711ed86375a854684f4330ea81,[Social Leopard: How to master and command the...
3,5041977a41240fa87c06e5d98500921a,[Muy buen artículo: Women Rising: The Unseen B...
4,751221978067db4e4cecef07c1763f2c,[Visiting our neighbors&#39; MLS Committee to ...
5,b88171637fa04a302e94b14402f2793a,"[<a href=""/scottsanchez"" class=""twitter-atrepl..."
6,f5abf96f244c876d17ecf69863cb0abb,"[@socialmurcia1 <a href=""/smerigom"" class=""twi..."
7,330afc0d53fa05e546fef9ec5ee76d3d,"[I wish there was a late night <a href=""/hasht..."
8,b9114397ca060ce9de73d8073b0c53c3,[Infographic: Doctors Prescribing More Mobile ...
9,cb9088be965f6ef60167811b4911d083,"[I always got your back, BFE<a href=""http://t...."


Funkcija za preprocesiranje podataka


In [7]:
def preprocess(tweet):
    tt = TweetTokenizer()
    soup = BeautifulSoup(tweet)
    text = soup.getText().replace('&nbsp', ' ')

    #uklanjanje hiperlinkova
    text = re.sub(r"http\S+", "URL", text)

    #uklanjanje brojeva
    text = re.sub(r"(\+|\-)*(\d+\.\d+)|\+(\d+)|\-(\d+)|(\d+)", "NUMBER", text)

    return tt.tokenize(text)


** 3. Preprocessiranje mape **


In [8]:
prepro_map = {}

pbar = pyprind.ProgBar(len(mapa))

for key in mapa:
    prepro_map[key] = [preprocess(tweet) for tweet in mapa[key]]
    pbar.update()

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:03:11


** 4. Učitavanje spola i dobi korisnika **

Enumeracije spola i dobi

In [10]:
from enum import Enum
from dataset_map_entry import Gender
from dataset_map_entry import AgeGroup

Razred u koji se sprema value mape: (spol, dob, lista sa tvitovima korisnika), pri cemu je kljuc korinsikov id

In [11]:
from dataset_map_entry import TweetMapEntry

In [12]:

final_map = {}
pbar = pyprind.ProgBar(len(prepro_map))

with open('./input/truth.txt','r') as fl:
    for f in fl:
        splitted = f.strip().split(':::')
        #print splitted[0], Gender[splitted[1]], AgeGroup["_" + splitted[2].replace('-','_')]
        final_map[splitted[0]] = TweetMapEntry(Gender[splitted[1]]
                                               , AgeGroup["_" + splitted[2].replace('-','_')], prepro_map[splitted[0]])
        pbar.update()

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:00


** 5. Spremanje mape u datoteku **

In [13]:
output_dir = open('./output/map_final.pkl', 'wb')
pickle.dump(final_map, output_dir)
output_dir.close()

** 6. Primjer ispisa mape sa samo jednim primjerkom twita **


In [14]:
df = pd.DataFrame()

for key in final_map:
    if len(final_map[key].get_tweets()) > 0:
        entry = final_map[key]
        df = df.append([[key, entry.get_gender(), entry.get_age_group(), entry.get_tweets()[0]]], ignore_index=True)

df.columns = ['user_id', 'gender', 'age group', 'only one tweet from list']

print "Broj praznih entry-a!! : ", no_of_empty, "/", len(mapa)
df.head(10)

Broj praznih entry-a!! :  15 / 436


Unnamed: 0,user_id,gender,age group,only one tweet from list
0,f677a164b16096fc2eb3af86694da8ac,Gender.MALE,AgeGroup._25_34,"[Flashback, :, WSJ, (, NUMBER, /, NUMBER, /, N..."
1,15bfa35315fb53d6d2116e103e744db2,Gender.MALE,AgeGroup._35_49,"[Veient, el, partit, del, Barça, en, un, hotel..."
2,833f9711a4e415b8398c8ddffbeac33d,Gender.FEMALE,AgeGroup._18_24,"[meet, Karen, Lee, !, :D]"
3,5041977a41240fa87c06e5d98500921a,Gender.FEMALE,AgeGroup._50_64,"[Muy, buen, artículo, :, Women, Rising, :, The..."
4,751221978067db4e4cecef07c1763f2c,Gender.FEMALE,AgeGroup._35_49,"[Visiting, our, neighbors, MLS, Committee, to,..."
5,b88171637fa04a302e94b14402f2793a,Gender.MALE,AgeGroup._25_34,"[@scottsanchezthanks, for, the, @, :-)]"
6,f5abf96f244c876d17ecf69863cb0abb,Gender.FEMALE,AgeGroup._35_49,"[@socialmurciaNUMBER, @smerigom, @MaribelRamnP..."
7,b0d2501202d406fd07ab84db6f751543,Gender.MALE,AgeGroup._50_64,"[A, chance, to, keep, the, folks, who, never, ..."
8,b9114397ca060ce9de73d8073b0c53c3,Gender.FEMALE,AgeGroup._35_49,"[Infographic, :, Doctors, Prescribing, More, M..."
9,cb9088be965f6ef60167811b4911d083,Gender.MALE,AgeGroup._25_34,"[I, always, got, your, back]"
