# workshop 6

Name: Yajing Zhou,  Michael Uzoma

In [1]:
from nltk import word_tokenize
from nltk.corpus import stopwords
import gensim
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LogisticRegressionCV, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.metrics import confusion_matrix, accuracy_score
import re


In [2]:
# Build the English stop-word set that read_line filters against.
# The corpus reader is imported under an alias because this cell rebinds the
# name `stopwords` (imported at the top as the NLTK corpus reader) to a plain
# set. Without the alias, running this cell a second time would call `.words`
# on that set and raise AttributeError.
from nltk.corpus import stopwords as nltk_stopwords
stopwords = set(nltk_stopwords.words('english'))

In [3]:
def read_directory(directory):
    """
    Lists the paths of all ".txt" files directly inside the given directory

    Parameters
    ----------
    directory : str or path-like
        Folder to scan. Sub-folders are not descended into.

    Returns
    -------
    list of str
        One "<directory>/<file name>" string per entry whose name ends with
        ".txt", sorted by file name. Empty if there is no such entry.

    Raises
    ------
    OSError
        Propagated from os.listdir (e.g. the directory does not exist).
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the order
    # of the loaded sentences (and every output derived from them)
    # reproducible across runs and machines.
    return [
        str(directory) + "/" + str(name)
        for name in sorted(os.listdir(directory))
        if name.endswith(".txt")
    ]

In [4]:
def read_file(path):
    """
    Reads all lines from file on given path, skipping comment lines

    Parameters
    ----------
    path : str or path-like
        File to read (opened in text mode with the platform default encoding).

    Returns
    -------
    list of str
        The file's lines in order, each still carrying its line terminator,
        excluding every line that starts with "#".
    """
    # The context manager closes the handle even if reading raises; the
    # handle was previously left open for every file read.
    with open(path, "r") as f:
        return [line for line in f if not line.startswith("#")]


In [5]:
def read_line(line):
    """
    Returns sentence category and cleaned sentence in given line

    The category is the first field of the line. If the line contains a tab,
    fields are tab-separated and the sentence is the second field (anything
    after a second tab is ignored). Otherwise the category is the text before
    the first space and the sentence is the rest of the line.

    The sentence is lower-cased, tokenized with word_tokenize, stripped of
    tokens found in the module-level `stopwords` set, and re-joined with
    single spaces. Characters other than word characters and apostrophes are
    then replaced by spaces, and runs of spaces are collapsed to one.

    Returns
    -------
    tuple of (str, str)
        (category, cleaned sentence)
    """
    if "\t" in line:
        splits = line.split("\t")
        s_category = splits[0]
        sentence = splits[1].lower()
    else:
        s_category = line.split(" ")[0]
        sentence = line[len(s_category)+1:].lower()

    sentence = " ".join([word for word in word_tokenize(sentence) if word not in stopwords])
    # Inside [...] the leading ^ negates the class: anything that is neither a
    # word character nor an apostrophe becomes a space. The pattern is a raw
    # string so that \w is not read as an (invalid) string escape.
    sentence = re.sub(r"[^\w']", ' ', sentence)
    # Collapse each run of spaces into a single space.
    sentence = re.sub(' +', ' ', sentence)
    return s_category, sentence

In [6]:
def read_traindata(input_folder):
    """
    Loads every labeled sentence found in the ".txt" files of a folder

    Returns two parallel lists, (categories, sentences): the category at
    position i belongs to the cleaned sentence at position i.
    """

    labels = []
    texts = []
    for path in read_directory(input_folder):
        for raw_line in read_file(path):
            label, text = read_line(raw_line)
            # Drop a single trailing newline if one is present.
            texts.append(text[:-1] if text.endswith('\n') else text)
            labels.append(label)
    return labels, texts

In [7]:
# Load the labeled corpus from the 'labeled_dataset' folder (path is relative
# to the working directory). `categories` and `sentence` are parallel lists:
# categories[i] is the label of the cleaned sentence sentence[i].
categories, sentence = read_traindata('labeled_dataset')


In [8]:
# Inspect the cleaned training sentences.
sentence

['although internet level topology extensively studied past years little known details taxonomy',
 'node represent wide variety organizations e g large isp small private business university vastly different network characteristics external connectivity patterns network growth tendencies properties hardly neglect working veracious internet representations simulation environments',
 'paper introduce radically new approach based machine learning techniques map ases internet natural taxonomy',
 'successfully classify number number percent ases expected accuracy number number percent',
 'release community level topology dataset augmented number taxonomy information number set attributes used classify ases',
 'believe dataset serve invaluable addition understanding structure evolution internet',
 'rapid expansion internet last two decades produced large scale system thousands diverse independently managed networks collectively provide global connectivity across wide spectrum geopolitical env

In [9]:
# Tabulate the labeled corpus: one row per sentence, next to its category.
df = pd.DataFrame({'categories': categories, 'sentence': sentence})
df

Unnamed: 0,categories,sentence
0,MISC,although internet level topology extensively s...
1,MISC,node represent wide variety organizations e g ...
2,AIMX,paper introduce radically new approach based m...
3,OWNX,successfully classify number number percent as...
4,OWNX,release community level topology dataset augme...
...,...,...
310,OWNX,fig example
311,MISC,however similar situation often occurs clustering
312,MISC,clustering algorithms cases give almost global...
313,MISC,thus better clustering assignment constructed ...


In [10]:
# Classifier inputs: the cleaned sentences (the 'sentence' column of df) and
# the list of category labels loaded alongside them.
doc = df.sentence
label = categories

In [11]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the labeled sentences; random_state is fixed so the split
# is the same on every run.
# NOTE(review): X_train / X_test / y_train / y_test are not referenced by any
# later cell visible in this notebook. The vectorizer and the classifier
# below are fit on the full `doc` / `label`, so no held-out evaluation is
# performed. Confirm whether that is intended.
X_train, X_test, y_train, y_test = train_test_split(doc,label, test_size = 0.33,random_state = 0)

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from all labeled sentences and build the
# document-term count matrix (one row per sentence, one column per term).
count_vect = CountVectorizer()
doc_count = count_vect.fit_transform(doc)
# (number of sentences, vocabulary size)
doc_count.shape

(315, 845)

In [13]:
from sklearn.feature_extraction.text import TfidfTransformer
# Term-frequency weighting only: use_idf=False switches off the IDF factor.
# NOTE(review): `tf_transformer` and `doc_tf` are not used by any later cell
# visible here; the classifier is trained on `doc_tfidf` instead.
tf_transformer = TfidfTransformer(use_idf = False).fit(doc_count)
doc_tf = tf_transformer.transform(doc_count)
doc_tf.shape

(315, 845)

In [14]:
# TF-IDF transformer with default settings. It is fitted on the labeled
# counts below and later reused, already fitted, on the unlabeled counts.
tfidf_transformer = TfidfTransformer()

In [15]:
# Fit the IDF weights on the labeled counts and convert them to TF-IDF
# features in one step; the shape stays (sentences, vocabulary terms).
doc_tfidf = tfidf_transformer.fit_transform(doc_count)
doc_tfidf.shape

(315, 845)

In [16]:
from sklearn.naive_bayes import MultinomialNB

In [17]:
# Multinomial naive Bayes classifier with default hyper-parameters.
clf = MultinomialNB()

In [18]:
# Train on the TF-IDF features of all labeled sentences. `clf` itself is
# fitted by this call and is the object used for prediction later on.
x_clf = clf.fit(doc_tfidf,label)
x_clf

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [19]:
def read_line(line):
    """
    Returns the cleaned sentence in given line of unlabeled data

    If the line contains a tab, only the text before the first tab is used;
    otherwise the whole line is used. The text is lower-cased, tokenized with
    word_tokenize, stripped of tokens found in the module-level `stopwords`
    set, and re-joined with single spaces. Characters other than word
    characters and apostrophes are then replaced by spaces, and runs of
    spaces are collapsed to one.

    NOTE: this redefines the read_line used for the labeled data, which
    returns a (category, sentence) pair. Once this cell has run,
    read_traindata can no longer unpack the result, so the notebook must be
    executed top to bottom.

    Returns
    -------
    str
        The cleaned sentence.
    """
    if "\t" in line:
        sentence = line.split("\t")[0].lower()
    else:
        sentence = line.lower()

    sentence = " ".join([word for word in word_tokenize(sentence) if word not in stopwords])
    # Inside [...] the leading ^ negates the class: anything that is neither a
    # word character nor an apostrophe becomes a space. The pattern is a raw
    # string so that \w is not read as an (invalid) string escape.
    sentence = re.sub(r"[^\w']", ' ', sentence)
    # Collapse each run of spaces into a single space.
    sentence = re.sub(' +', ' ', sentence)
    return sentence

In [20]:
def read_testdata(input_folder):
    """
    Collects the cleaned sentence of every line in the folder's ".txt" files

    Returns a flat list of sentences, in the order the files are listed and
    the lines appear within each file. No categories are returned.
    """

    return [
        read_line(raw_line)
        for path in read_directory(input_folder)
        for raw_line in read_file(path)
    ]

In [21]:
# Load and clean the sentences to classify from the 'unlabeled_dataset'
# folder (path is relative to the working directory), then inspect them.
unlabeled = read_testdata('unlabeled_dataset')
unlabeled

['central problem bioinformatics gene regulation find binding sites regulatory proteins ',
 'one promising approaches toward identifying short fuzzy sequence patterns comparative analysis orthologous intergenic regions related species ',
 'analysis complicated various factors ',
 'first one needs take phylogenetic relationship species account order distinguish conservation due occurrence functional sites spurious conservation due evolutionary proximity ',
 'second one deal complexities multiple alignments orthologous intergenic regions one consider possibility functional sites may occur outside conserved segments ',
 'present new motif sampling algorithm phylogibbs runs arbitrary collections multiple local sequence alignments orthologous sequences ',
 'algorithm searches ways arbitrary number binding sites arbitrary number transcription factors assigned multiple sequence alignments ',
 'binding site configurations scored bayesian probabilistic model treats aligned sequences model evolu

In [23]:
# Count terms with the vectorizer already fitted on the labeled sentences
# (transform, not fit_transform), so the columns match the training matrix.
unlabeled_counts = count_vect.transform(unlabeled)

In [24]:
# Apply the IDF weights fitted on the labeled counts to the unlabeled counts.
unlabeled_tfidf = tfidf_transformer.transform(unlabeled_counts)

In [25]:
# Predict one category per unlabeled sentence with the trained classifier.
predicted = clf.predict(unlabeled_tfidf)
predicted

array(['MISC', 'MISC', 'MISC', 'MISC', 'MISC', 'MISC', 'OWNX', 'MISC',
       'MISC', 'MISC', 'MISC', 'OWNX', 'OWNX', 'MISC', 'OWNX', 'MISC',
       'MISC', 'MISC', 'MISC', 'OWNX', 'MISC', 'OWNX', 'MISC', 'MISC',
       'MISC', 'MISC', 'OWNX', 'OWNX', 'OWNX', 'MISC', 'OWNX', 'MISC',
       'MISC', 'OWNX', 'OWNX', 'MISC', 'OWNX', 'MISC', 'OWNX', 'MISC',
       'MISC', 'MISC', 'MISC', 'OWNX', 'MISC', 'OWNX', 'MISC', 'MISC',
       'MISC', 'OWNX', 'OWNX', 'MISC', 'OWNX', 'MISC', 'MISC', 'MISC',
       'MISC', 'MISC', 'OWNX', 'MISC', 'OWNX', 'MISC', 'MISC', 'OWNX',
       'OWNX', 'MISC', 'MISC', 'MISC', 'MISC', 'OWNX', 'OWNX', 'OWNX',
       'OWNX', 'MISC', 'OWNX', 'MISC', 'OWNX', 'MISC', 'OWNX'],
      dtype='<U5')

In [28]:
# Pair every unlabeled sentence with the category predicted for it.
df2 = pd.DataFrame({'label': predicted, 'doc': unlabeled})
df2

Unnamed: 0,label,doc
0,MISC,central problem bioinformatics gene regulation...
1,MISC,one promising approaches toward identifying sh...
2,MISC,analysis complicated various factors
3,MISC,first one needs take phylogenetic relationship...
4,MISC,second one deal complexities multiple alignmen...
...,...,...
74,OWNX,synthetic datasets consist mixtures wm samples...
75,MISC,allows us compare performance algorithms ideal...
76,OWNX,tests also show extent binding sites recovered...
77,MISC,tests real data use 200 upstream regions sacch...


In [29]:
# Save the predictions as a tab-separated file in the working directory.
# The DataFrame index is written as the first column (to_csv default).
df2.to_csv('labeled.txt',sep='\t')

* Summary:

In this workshop, we first read all the documents, removed the stopwords, and paired each sentence with its category label. We then aggregated term-frequency information for each sentence (document) into a count vector and computed tf-idf weights from those counts. Finally, we used the tf-idf features to train our classifier and predicted the labels of the unlabeled dataset.

In this workshop, we found that naive Bayes is a very easy way to classify text, because it lets us classify documents quickly through probability calculations based on known (labeled) data.