## Unite all data

In [4]:
import pandas as pd
from gensim.models import Doc2Vec
from sklearn import utils
from sklearn.model_selection import train_test_split
import gensim
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
import re
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import nltk
import multiprocessing
from sklearn.metrics import accuracy_score, f1_score

# Number of CPU cores; passed to Doc2Vec as its worker count further down.
cores = multiprocessing.cpu_count()

# First source: drop every space from the class labels.
positives = pd.read_csv('Data/positive_combined_withreal.csv')
positives['class'] = positives['class'].str.replace(' ', '')

# Second source: keep case type and summary, exposed as `class` / `complain`.
cases = pd.read_csv('Data/case_files.csv')
cases_clean = pd.DataFrame(
    data={
        'class': cases['Case Type'],
        'complain': cases['Case Summary'],
    }
)

# Stack both sources row-wise and upper-case the first character of each label.
df = pd.concat([positives, cases_clean], axis=0)
df['class'] = df['class'].map(lambda label: label[0].upper() + label[1:])
df['class'].unique()

array(['Notrelated', 'Labor', 'Adult', 'Minor'], dtype=object)

## Text preprocessing

In [5]:
# Download the NLTK 'punkt' package; when it is already installed NLTK only
# reports that it is up to date.
# NOTE(review): presumably needed by the nltk.sent_tokenize / nltk.word_tokenize
# calls below; newer NLTK releases may expect a different resource name — confirm.
nltk.download('punkt')
from nltk.corpus import stopwords
def tokenize_text(text):
    """Split ``text`` into lower-cased word tokens.

    The text is first split into sentences with ``nltk.sent_tokenize`` and each
    sentence into words with ``nltk.word_tokenize``. Tokens shorter than two
    characters are dropped; the rest are returned lower-cased, in order.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    list of str
        The retained tokens.
    """
    return [
        token.lower()
        for sentence in nltk.sent_tokenize(text)
        for token in nltk.word_tokenize(sentence)
        if len(token) >= 2
    ]

[nltk_data] Downloading package punkt to /Users/maria/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Show complete cell contents when frames are displayed. `None` is the supported
# "no limit" value; the former -1 is deprecated and rejected by newer pandas.
pd.set_option('display.max_colwidth', None)

# Stringify every cell and keep only ASCII letters, digits and spaces.
# Column-wise Series.map replaces DataFrame.applymap, which is deprecated in
# recent pandas releases; the element-wise result is the same.
df = df.apply(lambda column: column.map(lambda x: re.sub("[^a-zA-Z0-9 ]", '', str(x))))

# Strip the literal residues 'notrelated' and 'nannan' from the complaint text.
df['complain'] = df['complain'].apply(lambda x: re.sub('notrelated|nannan', '', x))

# Shuffle the rows. The fixed seed makes the shuffle repeatable, so the seeded
# train/test split that follows selects the same rows on every run.
df = df.sample(frac=1, random_state=42)

In [7]:
# Preview the first 10 rows (after shuffling) whose class is 'Notrelated'.
df[df['class']=='Notrelated'].head(10)

Unnamed: 0,class,complain
223,Notrelated,Pay is on low side of average Smallish company with bad HR habits forced shutdowns during XmasNew Year with little 12 weeks notice cannot make plans Review process barely controlled in timing and Budgets tight due to business or other investments Custom ERPMRP systems which are dated
215,Notrelated,1 Manipulative environment I was apart of Paramit for several years and throughout the entirety of my duration I have contributed so much to the company In the recent years Paramit has become short staffed read other reviews to see why those who value their mental health left and has thus asked its current employees to step up and take over more responsibilities often times forcing many of its workers to work overtime Yes a company cant FORCE you to work overtime but if you dont work overtime they passively aggressively comment about your progress the next work day for falling behind on assignments They internalize you declining to work over time and hold this against you for a LONG time because you refused their request more like demand to work overtime Working overtime should be MY choice and I should NOT be punished for only working 40 HOURS if I choose to Other people have families dependents situations outside of work that requires their attention They should not be allowed to hold the inability to work overtime EVERY WEEK over our heads 2 Dismal Opportunities to Raise Salary Dont even bother trying to get a raise at this company They LOVE taking advantage of unaware foreign employees by paying them such laughable wages The amount of work they force upon you because theyre short staffed and cant seem to find new employees to stay long enough is NOT worth the pathetic pay When I worked here I played a very large role in my department to maintain customer satisfaction and constantly exhausted myself to ensure the quality of my work After recognizing that the amount of responsibilities I had at that time versus from when I first started had grown exponentially as did my skill set human capital and self worth I requested for a raise only to be met with a raise of MERE CENTS This was a wake up call for me This was how much I was worth to them If you are a current employee I recommend you to search online for the average salary of your line of work to 
see how much you actually are worth and find a different place that sees you for what youre skills and experience Im glad I did This company obviously doesnt value its employees evident in the narrow career growth opportunities 3 TERRIBLE MANAGEMENT There is such clear favoritism in this work place to the point of ostensible discrimination If you arent a certain ethnicity good luck with getting treated with respect There was a point in time where the manager tried to BAN THE USAGE OF FOREIGN LANGUAGES This is America and I am protected under the 1st Amendment of the US CONSTITUTION to SPEAK HOWEVER I WANT IN WHATEVER DIALECT I want Even if I was not apart of the majority ethnic group that action alone was already very questionable The fact that they tried to strip away certain GUARANTEED rights should already be an indicative red flag if you havent spotted any in my earlier points Thankfully HR was able to prevent an unconstitutional demand but that could have prompted legal action so if you are a current employee and this happens to you take action against this immediately Know your rights The toxicity of this place is enough to make anyone miserable Not only is management very unorganized and run poorly but they also have the tendency to take their anger out on their employees They REALLY make the effort to belittle you if you make a mistake and treat you with such blatant disrespect No I am not 3 years old No I do not need you to talk to me in a condescending voice Theres a difference between constructive criticism and insulting employees The amount of sheer disrespect I have experienced by the management here is honestly shocking and disgusting
152,Notrelated,Hectic work schedule strict office clockin timings late night and weekend working expected during commercialization comparatively low pay
3,Notrelated,I have nothing bad to report
62,Notrelated,None that come to mind at the moment
12,Notrelated,Many formal leadership trainings are not accessible due to tight budgets and not enforced by corporate
2,Notrelated,lot of turnover in the front desk department Favoritism within certain heads of department Higher ups do not seem genuine Overworked with 4 jobs at once and not enough employees to get support Claims to get 7 free nights but rarely ever gets approved Open door policy however your concerns are overlooked
145,Notrelated,Communication doesnt happen easily You have to chase down pay on a regular basis TERRIBLE BENEFITS No PTO family health insurance cost more that you get paidnan
267,Notrelated,Lack of structure cooperate looks down on workers and talk to them in a very condescending way Very unprofessional they will fire you over the phone Without any valid reason
54,Notrelated,Those in power ie managers etc are all that matters Lower level good luck


In [None]:
# Write the cleaned, shuffled dataset to a CSV file in the working directory.
df.to_csv('all_data_masha.csv')

In [87]:
def _to_tagged(row):
    """Build a TaggedDocument from one row: tokenized `complain` text, tagged with its `class`."""
    return TaggedDocument(words=tokenize_text(row['complain']), tags=[row['class']])


# Hold out 30% of the rows for testing, with a fixed seed for the split.
train, test = train_test_split(df, test_size=0.3, random_state=42)
train_tagged = train.apply(_to_tagged, axis=1)
test_tagged = test.apply(_to_tagged, axis=1)


In [88]:
# Inspect the tagged training document at position 30 as a sanity check.
train_tagged.values[30]

TaggedDocument(words=['management', 'can', 'not', 'stress', 'this', 'enough', 'they', 'hire', 'desperate', 'transplants', 'who', 'would', 'rather', 'try', 'and', 'be', 'friend', 'to', 'only', 'get', 'you', 'fired', 'very', 'weird', 'creepy', 'and', 'handsy', 'if', 'not', 'that', 'the', 'managers', 'were', 'always', 'absent', 'on', 'peak', 'days', 'or', 'would', 'over', 'staff', 'on', 'slow', 'days', 'there', 'was', 'no', 'team', 'building', 'managers', 'would', 'only', 'play', 'into', 'the', 'games', 'and', 'trash', 'talk', 'they', 'also', 'punished', 'people', 'for', 'write', 'ups', 'to', 'get', 'them', 'fired', 'when', 'they', 'knew', 'who', 'was', 'stealing', 'to', 'add', 'if', 'they', 'would', 'stay', 'to', 'close', 'these', 'things', 'wouldnt', 'happen', 'also', 'to', 'add', 'there', 'was', 'no', 'way', 'to', 'voice', 'concerns', 'people', 'would', 'fake', 'care', 'and', 'tell', 'whomever', 'is', 'the', 'issue', 'to', 'cause', 'further', 'problems', 'rather', 'than', 'resolve', 'i

## Training the model

In [89]:
# Doc2Vec configured with dm=0 (the distributed bag-of-words variant), 300-dim
# vectors, negative sampling (5 noise words, no hierarchical softmax), words seen
# fewer than 2 times ignored, no frequent-word downsampling, one worker per core.
model_dbow = Doc2Vec(
    dm=0,
    vector_size=300,
    negative=5,
    hs=0,
    min_count=2,
    sample=0,
    workers=cores,
)
# Build the vocabulary from the tagged training documents.
model_dbow.build_vocab(list(train_tagged.values))

In [90]:
# Train the model on the tagged training documents for 30 epochs.
model_dbow.train(train_tagged.values, total_examples=len(train_tagged.values),epochs=30)

In [91]:
 def vec_for_learning(model, tagged_docs):
     """Turn tagged documents into (labels, feature vectors) for a classifier.

     Parameters
     ----------
     model
         Object exposing ``infer_vector(words, epochs=...)`` (the trained Doc2Vec model).
     tagged_docs
         Object whose ``.values`` is a sequence of documents, each with ``.tags``
         and ``.words`` attributes; the first tag is used as the label.

     Returns
     -------
     (tuple, tuple)
         ``targets`` (first tag of each document) and ``regressors`` (the vector
         inferred for each document), in document order. Both are empty tuples
         when there are no documents.
     """
     docs = tagged_docs.values
     # zip(*[]) yields nothing to unpack, so handle the empty case explicitly.
     if len(docs) == 0:
         return (), ()
     # `epochs` is the supported keyword for the number of inference passes;
     # the older `steps` alias was removed in gensim 4.
     targets, regressors = zip(
         *[(doc.tags[0], model.infer_vector(doc.words, epochs=20)) for doc in docs]
     )
     return targets, regressors

In [92]:
# Collect class labels (y) and inferred document vectors (X) for both splits.
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)

In [93]:
# Fit a logistic-regression classifier on the inferred training vectors and
# predict labels for the test vectors. C is scikit-learn's inverse
# regularisation strength, so C=1e5 means very weak regularisation.
logreg = LogisticRegression(n_jobs=1, C=1e5)
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

In [94]:
# Accuracy of the predictions on the test split.
test_accuracy = accuracy_score(y_test, y_pred)
print('Testing accuracy %s' % test_accuracy)

# F1 score on the test split, averaged with average='weighted'.
test_f1 = f1_score(y_test, y_pred, average='weighted')
print('Testing F1 score: {}'.format(test_f1))


Testing accuracy 0.6298850574712643
Testing F1 score: 0.6128767626559046


## Test on a different source

https://www.theguardian.com/global-development/2017/jul/29/slept-floor-flat-near-harrods-stories-modern-slavery

In [95]:
# Three first-person accounts used as out-of-sample inputs for the classifier.
# NOTE(review): presumably excerpts from the Guardian article linked above — confirm.
text1='When my husband became very sick and couldn’t work, I used an employment agency to find me work abroad. I was sent to Qatar, but the family were cheating me, paying me less than agreed in my contract and refusing to give me a day off. I called the agency in the Philippines for help, but they never answered. I had to send money back home to pay for food, school fees and medicine. I fought with my employer about my salary, but he would say Your contract is just a piece of paper'

text2='I worked 12-hour shifts and finished every day at 6pm, the same time that the gang curfew in our city came into effect. There are two main gangs in the area, and anyone on the streets after the curfew becomes a target. Every day I thought might be my last. One evening, my co-worker and I were walking to the bus stop when three gang members stopped and said we’d have to sell sex and drugs for them. “We’re not asking you,” they said. “We’re giving you an order.” They let us go, but I was terrified. The next night after work, they were there, waiting for us. “Time’s up,” they said, and they forced us into a car at gunpoint'

text3='One of my friends in the village said he and a few others were leaving to find work. The next day we all got a taxi and headed for Thailand. We were met by a man who said we could work on his cassava farm, earning $130 (£99) a month each, with room and board included. We worked seven days a week, morning until night, for a month, until one evening a Thai man asked how much we were earning. He offered us $200 a month to work on a construction site, but said we’d have to move to Thailand We were confused. Weren’t we already in Thailand? It turned out we were still in Cambodia, and the farmer had already fled without giving us any wages. We were left with no choice but to accept the deal and smuggle ourselves over the border. The man said we’d be charged for being driven to the construction site, but that it could be deducted from our first month’s wages. It was a long, uncomfortable drive in a pickup, and when we finally stopped, we saw that we weren’t at a construction site, but a busy sea port. The broker said the building site had closed, so he’d arranged for us to work on a fishing boat instead'


In [96]:
def check_independent(text):
    """Classify one raw text with the trained model pair and print the predicted label.

    The text is cleaned with the same character filter used on the training
    data (only ASCII letters, digits and spaces are kept), tokenized with
    ``tokenize_text``, embedded with ``model_dbow.infer_vector`` and passed to
    ``logreg.predict``. The prediction array is printed; nothing is returned.

    Parameters
    ----------
    text
        Text to classify; it is converted with ``str()`` before cleaning.
    """
    cleaned = re.sub("[^a-zA-Z0-9 ]", '', str(text))
    words = tokenize_text(cleaned)
    # `epochs` is the supported keyword for the number of inference passes;
    # the older `steps` alias was removed in gensim 4.
    vector = model_dbow.infer_vector(words, epochs=20)
    print(logreg.predict([vector]))

In [97]:
# Classify the third sample text.
check_independent(text3)

['Labor']


In [98]:
# Classify the first sample text.
check_independent(text1)

['Labor']


In [99]:
# Classify the second sample text.
check_independent(text2)

['Minor']


In [102]:
import pickle

# Persist the Doc2Vec model and the classifier together as a two-item list.
# The context manager closes (and flushes) the file even if pickling fails;
# the bare open() used before left the handle to the garbage collector.
# NOTE(review): only unpickle this file from a trusted location — pickle.load
# can execute arbitrary code.
with open('word2vecworking1.pickle', 'wb') as model_file:
    pickle.dump([model_dbow, logreg], model_file)