# **identify and classify documents by topic project**

# reading data

our data set consists of 10 folders each folder contains 100 document of a particular field/topic so we'll be adding them to our pandas dataframe and give it a label . so our data well be labeled as the following --> ( business:1 , entertainement:2 , food:3 , graphics:4 , historical:5 , medical:6 , politics:7 , space:8 , sport:9 , technologie:10 )

In [4]:
import numpy as np
import pandas as pd
import nltk
import re 
import string                                        
from nltk.corpus import stopwords        
from nltk.stem import PorterStemmer 
from nltk.tokenize import TweetTokenizer 
import math

In [6]:
def read_file (path):
  s_doc = []
  for i in range(1,100):
    s =  str(i) +'.txt'
    final_path = path + s 
    with open(final_path , 'r') as reader:
      s_doc.append(reader.read())
  
  return s_doc


In [7]:
ind_list = []
doc_list = []
df = pd.DataFrame(columns=['document' , 'label'])
datadir = '/content/drive/MyDrive/archive/'
classes = ['business' , 'entertainment' , 'food' , 'graphics' , 'historical' , 'medical' , 'politics' , 'space' , 'sport' , 'technologie']
for c in classes :
  path = datadir + c + '/' + c + '_'
  class_num = classes.index(c) + 1
  doc_list = read_file(path)
  tmp_df = pd.DataFrame(columns=['document' , 'label'])
  tmp_df['document'] = doc_list
  tmp_df['label'] = class_num
  df = pd.concat([df , tmp_df] , ignore_index=True)


In [8]:
df.head()

Unnamed: 0,document,label
0,Lufthansa flies back to profit\n\nGerman airli...,1
1,Japanese growth grinds to a halt\n\nGrowth in ...,1
2,WorldCom director admits lying\n\nThe former c...,1
3,Glaxo aims high after profit fall\n\nGlaxoSmit...,1
4,Peugeot deal boosts Mitsubishi\n\nStruggling J...,1


In [9]:
df.label.unique()

array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=object)

# preprocessing

In [10]:
nltk.download("stopwords")
stop_words= stopwords.words('english')
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [11]:
def clean(doc):
    doc = re.sub(r'\$\w*', '', doc)
    doc = re.sub(r'^RT[\s]+', '', doc)
    doc = re.sub(r'https?:\/\/.[\r\n]', '', doc)
    doc = re.sub(r'#', '', doc)
    doc = re.sub(r'[0123456789\’\…\.\/_\”]*', '', doc)

    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tokens = tokenizer.tokenize(doc)
    
    cleandoc = []    
    for word in tokens:
        if (word not in stop_words and  word not in string.punctuation):
            stem_word = stemmer.stem(word) 
            cleandoc.append(stem_word)
            
    return cleandoc

In [12]:
clean_docs = []
for i in range(df.shape[0]):
  clean_docs.append(clean(df.iloc[i]['document']))


In [13]:
df['clean_tokens'] = clean_docs

In [14]:
df.head()

Unnamed: 0,document,label,clean_tokens
0,Lufthansa flies back to profit\n\nGerman airli...,1,"[lufthansa, fli, back, profit, german, airlin,..."
1,Japanese growth grinds to a halt\n\nGrowth in ...,1,"[japanes, growth, grind, halt, growth, japan, ..."
2,WorldCom director admits lying\n\nThe former c...,1,"[worldcom, director, admit, lie, former, chief..."
3,Glaxo aims high after profit fall\n\nGlaxoSmit...,1,"[glaxo, aim, high, profit, fall, glaxosmithkli..."
4,Peugeot deal boosts Mitsubishi\n\nStruggling J...,1,"[peugeot, deal, boost, mitsubishi, struggl, ja..."


In [15]:
df.clean_tokens[0]

['lufthansa',
 'fli',
 'back',
 'profit',
 'german',
 'airlin',
 'lufthansa',
 'return',
 'profit',
 'post',
 'huge',
 'loss',
 'preliminari',
 'report',
 'airlin',
 'announc',
 'net',
 'profit',
 'euro',
 '£',
 'compar',
 'loss',
 'euro',
 'oper',
 'profit',
 'euro',
 'ten',
 'time',
 'lufthansa',
 'hit',
 'tough',
 'competit',
 'dip',
 'demand',
 'follow',
 'iraq',
 'war',
 'killer',
 'sar',
 'viru',
 'also',
 'hit',
 'troubl',
 'us',
 'cater',
 'busi',
 'last',
 'year',
 'lufthansa',
 'show',
 'sign',
 'recoveri',
 'even',
 'european',
 'us',
 'airlin',
 'teeter',
 'brink',
 'bankruptci',
 'board',
 'lufthansa',
 'recommend',
 'pay',
 'dividend',
 'euro',
 'per',
 'share',
 'sharehold',
 'get',
 'dividend',
 'compani',
 'said',
 'give',
 'detail',
 'result',
 'march']

# training logistic regression classifier

In [16]:
def frequency( token_lists , label ):
  freq = {}
  count = 1 
  for i in range(len(token_lists)):
    for token in token_lists[i]:
      key = token
      if token in freq.keys() :
        count = count + 1
      else :
        count = 1
      value = [count , label]
      freq[key] = value
  return freq


In [17]:
freq = {}
for i in range(1,11):
  class_alone = df.query('label == ' + str(i))['clean_tokens']
  class_alone = class_alone.set_axis(range(0,99) , axis=0)
  freq2 = frequency(class_alone , i)
  freq.update(freq2)


In [18]:
freq

{'lufthansa': [2, 1],
 'fli': [25, 9],
 'back': [18, 10],
 'profit': [2, 10],
 'german': [1, 10],
 'airlin': [30, 8],
 'return': [15, 10],
 'post': [12, 10],
 'huge': [9, 10],
 'loss': [9, 8],
 'preliminari': [4, 9],
 'report': [2, 10],
 'announc': [12, 10],
 'net': [16, 10],
 'euro': [16, 10],
 '£': [3, 10],
 'compar': [34, 10],
 'oper': [6, 10],
 'ten': [4, 10],
 'time': [2, 10],
 'hit': [3, 10],
 'tough': [5, 10],
 'competit': [9, 10],
 'dip': [6, 9],
 'demand': [18, 10],
 'follow': [36, 10],
 'iraq': [37, 7],
 'war': [2, 10],
 'killer': [1, 9],
 'sar': [4, 8],
 'viru': [3, 10],
 'also': [4, 10],
 'troubl': [16, 10],
 'us': [12, 10],
 'cater': [1, 1],
 'busi': [2, 10],
 'last': [3, 10],
 'year': [9, 10],
 'show': [36, 10],
 'sign': [5, 10],
 'recoveri': [14, 9],
 'even': [4, 10],
 'european': [4, 10],
 'teeter': [1, 1],
 'brink': [1, 1],
 'bankruptci': [3, 1],
 'board': [10, 10],
 'recommend': [5, 10],
 'pay': [27, 10],
 'dividend': [1, 9],
 'per': [4, 10],
 'share': [5, 10],
 'shar

In [19]:
#@title
def sigmoid(z): 
    '''
    Input:
        z: is the input (can be a scalar or an array)
    Output:
        h: the sigmoid of z
    '''
    #assert(z.shape == (1, 11))
    # calculate the sigmoid of z
    h = 1 / (1 + np.exp(-z))
 
    
    return h

In [20]:
#@title
def gradientDescent(x, y, theta, alpha, num_iters):
    '''
    Input:
        x: matrix of features which is (m,n+1)
        y: corresponding labels of the input matrix x, dimensions (m,1)
        theta: weight vector of dimension (n+1,1)
        alpha: learning rate
        num_iters: number of iterations you want to train your model for
    Output:
        J: the final cost
        theta: your final weight vector
    Hint: you might want to print the cost to make sure that it is going down.
    '''
    # get 'm', the number of rows in matrix x
    m = 990     
    for i in range(0, num_iters):
        
        # get z, the dot product of x and theta
        z = np.dot(x,theta)
        
        # get the sigmoid of h
        h = sigmoid(z)
        
        # calculate the cost function
        J = -1./m * (np.dot(y.transpose(), np.log(h)) + np.dot((1-y).transpose(),np.log(1-h)))                                                    

        # update the weights theta
        theta = theta - (alpha/m) * np.dot(x.transpose(),(h-y))
        
    #J.float()
    return J, theta

In [21]:
#@title
def extract_features(doc_list, freqs):
    '''
    Input: 
        tweet: a list of words for one tweet
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output: 
        x: a feature vector of dimension (1,3)
    '''
   
    # 3 elements in the form of a 1 x 3 vector
    x = np.zeros((1, 11)) 
    
    #bias term is set to 1
    x[0,0] = 1 
    
    # loop through each word in the list of words
    for word in doc_list :
      if word in freqs.keys():
        if freqs[word][1] == 1 :
          x[0,1] += 1 
        if freqs[word][1] == 2 :
          x[0,2] += 1 
        if freqs[word][1] == 3 :
          x[0,3] += 1       
        if freqs[word][1] == 4 :
          x[0,4] += 1   
        if freqs[word][1] == 5 :
          x[0,5] += 1        
        if freqs[word][1] == 6 :
          x[0,6] += 1        
        if freqs[word][1] == 7 :
          x[0,7] += 1        
        if freqs[word][1] == 8 :
          x[0,8] += 1   
        if freqs[word][1] == 9 :
          x[0,9] += 1 
        if freqs[word][1] == 10 :
          x[0,10] += 1   

    assert(x.shape == (1, 11))
    return x

In [22]:
#@title
x = []
for t_list in df.clean_tokens :
  tmp = extract_features(t_list, freq)
  x.append(tmp)


In [23]:
#@title
df['x'] = x

In [24]:
#@title
df.head()

Unnamed: 0,document,label,clean_tokens,x
0,Lufthansa flies back to profit\n\nGerman airli...,1,"[lufthansa, fli, back, profit, german, airlin,...","[[1.0, 10.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 6.0..."
1,Japanese growth grinds to a halt\n\nGrowth in ...,1,"[japanes, growth, grind, halt, growth, japan, ...","[[1.0, 10.0, 1.0, 0.0, 0.0, 2.0, 0.0, 7.0, 2.0..."
2,WorldCom director admits lying\n\nThe former c...,1,"[worldcom, director, admit, lie, former, chief...","[[1.0, 26.0, 0.0, 0.0, 0.0, 2.0, 9.0, 4.0, 1.0..."
3,Glaxo aims high after profit fall\n\nGlaxoSmit...,1,"[glaxo, aim, high, profit, fall, glaxosmithkli...","[[1.0, 11.0, 2.0, 0.0, 0.0, 0.0, 2.0, 1.0, 4.0..."
4,Peugeot deal boosts Mitsubishi\n\nStruggling J...,1,"[peugeot, deal, boost, mitsubishi, struggl, ja...","[[1.0, 33.0, 2.0, 0.0, 0.0, 1.0, 1.0, 1.0, 4.0..."


In [25]:
#@title
X = np.zeros((len(df), 11))
for i in range(len(df)):
    X[i, :]= extract_features(df.clean_tokens[i], freq)

In [26]:
#@title
X.shape

(990, 11)

In [27]:
#@title
y= df.label.astype(float)

In [28]:
#@title
J, theta = gradientDescent(X, y.values, np.zeros((11, 1)), 1e-9,440)

In [29]:
#@title
theta

array([[2.19648694e-07, 2.19648694e-07, 2.19648694e-07, ...,
        4.17339093e-06, 4.17339093e-06, 4.17339093e-06],
       [3.74298426e-07, 3.74298426e-07, 3.74298426e-07, ...,
        7.11167969e-06, 7.11167969e-06, 7.11167969e-06],
       [5.37991529e-07, 5.37991529e-07, 5.37991529e-07, ...,
        1.02218465e-05, 1.02218465e-05, 1.02218465e-05],
       ...,
       [4.14966194e-06, 4.14966194e-06, 4.14966194e-06, ...,
        7.88672274e-05, 7.88672274e-05, 7.88672274e-05],
       [3.30274436e-06, 3.30274436e-06, 3.30274436e-06, ...,
        6.27597366e-05, 6.27597366e-05, 6.27597366e-05],
       [3.68850652e-05, 3.68850652e-05, 3.68850652e-05, ...,
        7.00942710e-04, 7.00942710e-04, 7.00942710e-04]])

In [30]:
#@title
print(J)

[0.65856246 0.65856246 0.65856246 0.65856246 0.65856246 0.65856246
 0.65856246 0.65856246 0.65856246 0.65856246 0.65856246 0.65856246
 0.65856246 0.65856246 0.65856246 0.65856246 0.65856246 0.65856246
 0.65856246 0.65856246 0.65856246 0.65856246 0.65856246 0.65856246
 0.65856246 0.65856246 0.65856246 0.65856246 0.65856246 0.65856246
 0.65856246 0.65856246 0.65856246 0.65856246 0.65856246 0.65856246
 0.65856246 0.65856246 0.65856246 0.65856246 0.65856246 0.65856246
 0.65856246 0.65856246 0.65856246 0.65856246 0.65856246 0.65856246
 0.65856246 0.65856246 0.65856246 0.65856246 0.65856246 0.65856246
 0.65856246 0.65856246 0.65856246 0.65856246 0.65856246 0.65856246
 0.65856246 0.65856246 0.65856246 0.65856246 0.65856246 0.65856246
 0.65856246 0.65856246 0.65856246 0.65856246 0.65856246 0.65856246
 0.65856246 0.65856246 0.65856246 0.65856246 0.65856246 0.65856246
 0.65856246 0.65856246 0.65856246 0.65856246 0.65856246 0.65856246
 0.65856246 0.65856246 0.65856246 0.65856246 0.65856246 0.6585

In [31]:
def hypothesis(theta , X):
    z = np.dot(X, theta)
    return 1/(1+np.exp(-(z)))

In [32]:
def cost(X, y, theta):
    y1 = hypothesis(X, theta)
    return -(1/len(X)) * np.sum(y*np.log(y1) + (1-y)*np.log(1-y1))

In [34]:
def gradient_descent(X, y, theta, alpha, epochs):
    m = len(X)
    for i in range(0, epochs):
        for j in range(0, 10):
            theta = pd.DataFrame(theta)
            h = hypothesis(theta.iloc[:,j], X)
            for k in range(0, theta.shape[0]):
                theta.iloc[k, j] -= (alpha/m) * np.sum((h-y.iloc[:, j])*X.iloc[:, k])
            theta = pd.DataFrame(theta)
    return theta, cost

In [35]:
X = pd.DataFrame(X)

In [70]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1.0,10.0,0.0,0.0,0.0,0.0,0.0,1.0,6.0,7.0,52.0
1,1.0,10.0,1.0,0.0,0.0,2.0,0.0,7.0,2.0,10.0,123.0
2,1.0,26.0,0.0,0.0,0.0,2.0,9.0,4.0,1.0,9.0,155.0
3,1.0,11.0,2.0,0.0,0.0,0.0,2.0,1.0,4.0,6.0,94.0
4,1.0,33.0,2.0,0.0,0.0,1.0,1.0,1.0,4.0,10.0,126.0
...,...,...,...,...,...,...,...,...,...,...,...
985,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,487.0
986,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,172.0
987,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,280.0
988,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,234.0


In [37]:
y = pd.DataFrame()

In [38]:
y = pd.get_dummies(df.label)

In [39]:
y.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
0,1,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0


In [91]:
theta = np.zeros([X.shape[1], y.shape[1]])
theta , cost = gradient_descent(X, y, theta, 1e-9, 2000)

In [92]:
theta

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-7.940677e-07,-7.938147e-07,-7.934868e-07,-7.939001e-07,-7.955156e-07,-7.941582e-07,-7.943769e-07,-7.943872e-07,-7.938067e-07,-7.949211e-07
1,1.713963e-06,-1.693648e-06,-1.693077e-06,-1.693787e-06,-1.696474e-06,-1.69419e-06,-1.694605e-06,-1.694537e-06,-1.693619e-06,-1.695557e-06
2,-2.214869e-06,2.23807e-06,-2.435949e-06,-2.436754e-06,-2.439831e-06,-2.437219e-06,-2.437682e-06,-2.437639e-06,-2.436579e-06,-2.438752e-06
3,-2.278316e-06,-2.267942e-06,2.215161e-06,-2.344803e-06,-2.346562e-06,-2.345125e-06,-2.345317e-06,-2.345375e-06,-2.344718e-06,-2.345898e-06
4,-2.510261e-06,-2.451025e-06,-2.489398e-06,2.493763e-06,-2.625597e-06,-2.600604e-06,-2.604743e-06,-2.606034e-06,-2.59399e-06,-2.615093e-06
5,-9.243017e-06,-9.098505e-06,-9.182112e-06,-9.470524e-06,7.731831e-06,-9.863265e-06,-9.869855e-06,-9.869669e-06,-9.85353e-06,-9.885186e-06
6,-8.025005e-06,-8.021287e-06,-6.500434e-06,-8.020592e-06,-6.428128e-06,3.991266e-06,-8.372088e-06,-8.372552e-06,-8.350326e-06,-8.393115e-06
7,-6.015246e-06,-6.274106e-06,-6.382403e-06,-6.665222e-06,-3.503781e-06,-6.047085e-06,2.365166e-07,-6.966962e-06,-6.958745e-06,-6.975318e-06
8,-1.737608e-05,-1.70597e-05,-1.627602e-05,-1.568363e-05,-1.301456e-05,-1.501505e-05,-1.707015e-05,9.460907e-07,-1.853042e-05,-1.860845e-05
9,-1.340805e-05,-1.315752e-05,-1.298152e-05,-1.372407e-05,-1.040689e-05,-1.182641e-05,-1.230895e-05,-1.234475e-05,-3.582308e-06,-1.489794e-05


# testing

In [46]:
test_set= []
for c in classes :
  path =datadir + c + '/' + c + '_'
  s =  str(100) +'.txt'
  final_path = path + s 
  with open(final_path , 'r') as reader:
    test_set.append(reader.read())
  
    

In [47]:
len(test_set)

10

In [48]:
y_t = [1,2,3,4,5,6,7,8,9,10]
y_test = pd.get_dummies(y_t)

In [49]:
y_test

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
0,1,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0
6,0,0,0,0,0,0,1,0,0,0
7,0,0,0,0,0,0,0,1,0,0
8,0,0,0,0,0,0,0,0,1,0
9,0,0,0,0,0,0,0,0,0,1


In [51]:
clean_test = []
for i in range(10):
  clean_test.append(clean(test_set[i]))

In [96]:
x_test = np.zeros((len(clean_test), 11))
for i in range(10):
    x_test[i, :]= extract_features(clean_test[i], freq)

In [97]:
y_pred = []
for i in range(0, 10):
    theta1 = pd.DataFrame(theta)
    h = hypothesis(theta1.iloc[:,i], x_test)
    y_pred.append(h)
y_pred=pd.DataFrame(y_pred)

In [98]:
y_pred

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.496772,0.496716,0.497022,0.498635,0.488565,0.498866,0.493676,0.49726,0.495623,0.489346
1,0.49663,0.496576,0.496898,0.498577,0.488071,0.498816,0.493396,0.497156,0.495435,0.488877
2,0.496445,0.496386,0.496753,0.498501,0.487432,0.498751,0.493033,0.497026,0.495191,0.488258
3,0.496676,0.49662,0.496943,0.4986,0.488236,0.498834,0.49348,0.497211,0.495493,0.489026
4,0.497556,0.497509,0.497738,0.498967,0.491425,0.499136,0.495194,0.497919,0.496686,0.491889
5,0.496803,0.49675,0.49709,0.498654,0.488693,0.49888,0.493735,0.497321,0.495686,0.489449
6,0.496941,0.496888,0.497177,0.498704,0.48917,0.498924,0.494074,0.497393,0.495853,0.489896
7,0.496916,0.496883,0.497233,0.498721,0.489191,0.498921,0.493938,0.497641,0.495826,0.489814
8,0.496621,0.496561,0.4969,0.498569,0.488023,0.498811,0.493372,0.497142,0.495465,0.488833
9,0.497253,0.497199,0.497436,0.498833,0.490216,0.499034,0.494602,0.497608,0.496252,0.490944


In [102]:
accuracy = 0
for col in range(0,10):
    for row in range(len(y_pred)):
        if y.iloc[row, col] == 1 and y_pred.iloc[col, row] >= 0.49:
            accuracy += 1
accuracy = accuracy/len(x_test)

In [103]:
accuracy

0.8