In [None]:
# create the requirement file

In [None]:
%%writefile requirements.txt
pandas
numpy
scikit-learn
streamlit
nltk
joblib

gensim
textblob  #addfor sentiment

Overwriting requirements.txt


In [None]:
!pip install -r /content/requirements.txt



In [None]:
# import the libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import joblib
import warnings
warnings.filterwarnings('ignore')

from gensim.models import Word2Vec


In [None]:
# Download the NLTK resources needed by the text preprocessing further down.

# Punkt tokenizer models: used to split text into words
nltk.download('punkt')

# English stop-word list: common words such as "the", "is", "and"
nltk.download('stopwords')

# WordNet data: required for lemmatization
nltk.download('wordnet')

# NOTE(review): presumably the tokenizer tables that newer NLTK releases look up
# instead of 'punkt' — confirm against the installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# create a utility file for reusability

In [None]:
# set the random state
# so that repeated runs of the different algorithms reuse the same data split
np.random.seed(42)

In [None]:
# data loading from the web

import os  # filesystem checks and directory traversal
import tarfile  # reading the downloaded .tar.gz archive
import urllib.request  # fetching the archive over HTTP

# 20 Newsgroups: discussion-forum posts spread over 20 newsgroups.
url = "http://qwone.com/~jason/20Newsgroups/20news-bydate.tar.gz"
archive_path = "20news-bydate.tar.gz"

# Download only when the archive is not already on disk, so that re-running the
# notebook does not hit the network again.  The file is fetched under a
# temporary name and renamed once complete, so an interrupted download can
# never be mistaken for a finished archive on the next run.
if not os.path.exists(archive_path):
    partial_path = archive_path + ".part"
    urllib.request.urlretrieve(url, partial_path)
    os.replace(partial_path, archive_path)


('20news-bydate.tar.gz', <http.client.HTTPMessage at 0x7c0d2fdf2f10>)

In [None]:
# Extract the archive into the working directory.
with tarfile.open(archive_path, "r:gz") as tar:
  # The archive comes from the network, so use the restrictive 'data'
  # extraction filter (blocks absolute paths, path traversal and special
  # files) whenever the running Python supports extraction filters.
  if hasattr(tarfile, "data_filter"):
    tar.extractall(path='.', filter='data')
  else:
    tar.extractall(path='.')


In [None]:
# Build one record per article by walking the extracted train/test folders,
# then assemble all records into a single DataFrame.
records = []
for split in ['20news-bydate-train', '20news-bydate-test']:  # both splits are pooled together
  for newsgroup in sorted(os.listdir(split)):  # one sub-folder per newsgroup
    group_path = os.path.join(split, newsgroup)
    if not os.path.isdir(group_path):  # ignore anything that is not a newsgroup folder
        continue

    # Top-level family of the newsgroup: alt/comp/misc/rec/sci/soc/talk
    tgt = newsgroup.split('.')[0]

    # os.listdir returns entries in arbitrary, filesystem-dependent order.
    # Sorting fixes the row order, which the seeded sampling and splits
    # later in the notebook rely on to be reproducible across machines.
    for filename in sorted(os.listdir(group_path)):  # one file per article
        file_path = os.path.join(group_path, filename)

        # latin1 maps every byte to a character, so reading never fails on odd bytes
        with open(file_path, encoding='latin1') as f:
          text = f.read()

        records.append(
            {'text': text,                      # full article text
             'tgt': tgt,                        # main (short) category
             'true_label_original': newsgroup,  # original newsgroup name
             'article_id': filename             # file name of the article
             }
        )

df = pd.DataFrame(records)  # one row per article
print(df.shape)
print(df.head(1))

(18846, 4)
                                                text  tgt true_label_original  \
0  From: I3150101@dbstu1.rz.tu-bs.de (Benedikt Ro...  alt         alt.atheism   

  article_id  
0      51261  


In [None]:
df.tgt.nunique()

7

In [None]:
df.true_label_original.nunique()

20

In [None]:
df.tgt.unique()

array(['alt', 'comp', 'misc', 'rec', 'sci', 'soc', 'talk'], dtype=object)

In [None]:
df.true_label_original.unique()

array(['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc',
       'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
       'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles',
       'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt',
       'sci.electronics', 'sci.med', 'sci.space',
       'soc.religion.christian', 'talk.politics.guns',
       'talk.politics.mideast', 'talk.politics.misc',
       'talk.religion.misc'], dtype=object)

In [None]:
'''
# major categories
alt = alternative
comp = computer
misc = miscellaneous
rec = recreational
sci = science
soc = social
talk = discussion
'''

'\n# major categories\nalt = alternative\ncomp = computer\nmisc = miscellaneous\nrec = recreational\nsci = science\nsoc = social\ntalk = discussion\n'

In [None]:
display(df.head(4))

Unnamed: 0,text,tgt,true_label_original,article_id
0,From: I3150101@dbstu1.rz.tu-bs.de (Benedikt Ro...,alt,alt.atheism,51261
1,From: mathew <mathew@mantis.co.uk>\nSubject: R...,alt,alt.atheism,51240
2,From: timmbake@mcl.ucsb.edu (Bake Timmons)\nSu...,alt,alt.atheism,53334
3,From: sandvik@newton.apple.com (Kent Sandvik)\...,alt,alt.atheism,53057


In [None]:
# Work on a 20% sample of the articles to keep the later steps fast.
# The sample is stratified on the main category ('tgt') so every category keeps
# its share, and random_state makes the same rows come out on every run.

sample_df, _ = train_test_split(df, train_size=.2, random_state=42, stratify=df['tgt'])
orig_df = df.copy()  # ~18k  (untouched copy of the full dataset)
data = sample_df.copy()  # ~4k  (working copy that the following cells modify)

print(data.shape)

(3769, 4)


In [None]:
'''
ax,ay,bx,by = tts(x,y)
a,_ = ttx(df)
'''

'\nax,ay,bx,by = tts(x,y)\na,_ = ttx(df)\n'

In [None]:
#data.tgt.value_counts().index
data.tgt.value_counts().keys()

Index(['comp', 'rec', 'sci', 'talk', 'soc', 'misc', 'alt'], dtype='object', name='tgt')

In [None]:
# Integer code for every main category; position in the tuple is the code.
categories = {
    name: code
    for code, name in enumerate(('comp', 'rec', 'sci', 'talk', 'soc', 'misc', 'alt'))
}

# Keep the readable label under 'category_name' and add its numeric code as 'category'.
data.rename(columns={'tgt':'category_name'}, inplace=True)
category_codes = data['category_name'].map(categories)
data['category'] = category_codes
print(data.columns)

Index(['text', 'category_name', 'true_label_original', 'article_id',
       'category'],
      dtype='object')


In [None]:
display(data.head(1))

Unnamed: 0,text,category_name,true_label_original,article_id,category
6347,From: maynard@ramsey.cs.laurentian.ca (Roger M...,rec,rec.sport.hockey,53696,1


In [None]:
# create our own library module and put the reusable functions in it

In [None]:
%%writefile utils.py
# utility functions for text preprocessing

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import string
import numpy as np

# Text cleaning: lowercase, strip punctuation, tokenize, drop stop words, lemmatize.

# Translation table that deletes every ASCII punctuation character; built once.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

# Filled in on first use so that importing this module never requires the
# NLTK corpora to be downloaded already.
_STOP_WORDS = None
_LEMMATIZER = None


def _load_nlp_resources():
    """Return the shared (stop-word set, lemmatizer) pair, creating it on first call."""
    global _STOP_WORDS, _LEMMATIZER
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()
    return _STOP_WORDS, _LEMMATIZER


def preprocess_text(text):
    """Clean a raw text string and return it as a space-joined string of tokens.

    Steps, in order: lowercase, delete punctuation characters, tokenize with
    ``word_tokenize``, drop English stop words, lemmatize the remaining tokens.

    Note that punctuation is deleted rather than replaced by a space, so
    ``user@host.com`` becomes the single token ``userhostcom``.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' when nothing is left).
    """
    # The stop-word list and lemmatizer do not depend on the input, so they are
    # built once and reused instead of being recreated for every document.
    stop_words, lemmatizer = _load_nlp_resources()

    # lowercase and remove punctuation
    text = text.lower().translate(_PUNCT_TABLE)

    # tokenize
    tokens = word_tokenize(text)

    # remove stop words, then lemmatize what is left
    return ' '.join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )

def get_word2vec_embeddings(tokens, model):
    """Average the Word2Vec vectors of the tokens the model knows.

    Tokens missing from ``model.wv`` are ignored.  When no token is known
    (including an empty token list) a zero vector of length
    ``model.vector_size`` is returned instead.
    """
    known_vectors = []
    for token in tokens:
        if token in model.wv:
            known_vectors.append(model.wv[token])

    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

Overwriting utils.py


In [None]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
#from utils import preprocess_text
from utils import preprocess_text , get_word2vec_embeddings

In [None]:
data['claened_text'] = data['text'].apply(preprocess_text)

In [None]:
display(data.head(2))

Unnamed: 0,text,category_name,true_label_original,article_id,category,claened_text
6347,From: maynard@ramsey.cs.laurentian.ca (Roger M...,rec,rec.sport.hockey,53696,1,maynardramseycslaurentianca roger maynard subj...
4879,From: James Leo Belliveau <jbc9+@andrew.cmu.ed...,rec,rec.motorcycles,104667,1,james leo belliveau jbc9andrewcmuedu subject f...


In [None]:
#data.rename(columns={'claened_text':'cleaned_text'},inplace=True)

In [None]:
# save preprocessed data
data.to_csv('preprocessed_data.csv',index=False)
print('preprocessed data saved')

preprocessed data saved


In [None]:
# to clear the modules and use it before recreation
'''
import sys
del sys.modules["utils"]
'''

import sys
# Forget any cached copy of the module so the next `import utils` re-reads utils.py.
try:
  del sys.modules['utils']
except KeyError:
  # utils was never imported in this kernel: nothing to clear
  pass

In [None]:
# test the function
preprocess_text('abc!@##$%#$%')

'abc'

In [None]:
# Turn the cleaned text into numeric TF-IDF features.
# max_features=1000 caps the vocabulary at the 1000 most frequent terms;
# stop_words='english' additionally applies scikit-learn's own stop-word list.
# NOTE(review): the vectorizer is fitted on every row of `data`, while the
# train/test split only happens later inside train_classifier, so the test
# documents also shape the vocabulary and IDF weights.

tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X_tfidf = tfidf_vectorizer.fit_transform(data['claened_text'])

X_tfidf.shape

(3769, 1000)

In [None]:
X_tfidf

<3769x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 169907 stored elements in Compressed Sparse Row format>

In [None]:
X_tfidf[:1, :1000].toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.04008352, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.04057648, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.03764108, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  

In [None]:
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [None]:
# Word2Vec
# Train word embeddings on the cleaned articles, then represent each article by
# the mean of its word vectors.
tokenized_texts = [word_tokenize(text) for text in data['claened_text']]

# A fixed seed plus a single worker thread keeps training repeatable, so the
# saved word2vec.model and the classifiers trained on X_w2v stay consistent
# between runs (multi-threaded training orders updates non-deterministically).
# gensim additionally asks for a fixed PYTHONHASHSEED for bit-exact results.
w2v_model = Word2Vec(sentences=tokenized_texts, vector_size=50, window=5, min_count=1, workers=1, seed=42)

# One 50-dimensional row per article
X_w2v = np.array([get_word2vec_embeddings(text, w2v_model) for text in tokenized_texts])
w2v_model.save('word2vec.model')


In [None]:
# function to train the classification models

def _fit_score_and_save(estimator, param_grid, label, file_prefix, feature_type,
                        X_train, X_test, y_train, y_test):
    """Grid-search one estimator, print its held-out accuracy and save the best model.

    The best estimator is written to ``{file_prefix}_model_{feature_type.lower()}.pkl``.
    Returns the accuracy on the test split.
    """
    grid = GridSearchCV(estimator, param_grid, cv=2)
    grid.fit(X_train, y_train)

    accuracy = accuracy_score(y_test, grid.predict(X_test))
    print(f'Best {label} paarams: {feature_type}:', grid.best_params_)
    print(f'{label} accuracy: {feature_type}:', accuracy)

    joblib.dump(grid.best_estimator_, f'{file_prefix}_model_{feature_type.lower()}.pkl')
    return accuracy


def train_classifier(X, y, model_name, feature_type):
    """Train, evaluate and persist the classifiers for one feature representation.

    The data is split 80/20 (stratified, fixed seed).  A logistic regression and
    a gradient boosting classifier are each tuned with a 2-fold grid search on
    the training part, scored on the test part and saved with joblib as
    ``lr_model_<feature_type>.pkl`` and ``gbc_model_<feature_type>.pkl``.

    Parameters
    ----------
    X : feature matrix (sparse or dense), one row per document
    y : class labels aligned with the rows of ``X``
    model_name : unused; kept so existing calls keep working
    feature_type : name of the feature representation, used in the printed
        messages and (lower-cased) in the saved file names

    Returns
    -------
    dict
        Test accuracies keyed by ``'lr'`` and ``'gbc'``.
    """
    # Imported here because the notebook's import cell does not provide it.
    from sklearn.ensemble import GradientBoostingClassifier

    splits = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    X_train, X_test, y_train, y_test = splits

    # max_iter is raised above the default: warnings are silenced globally in
    # this notebook, so a solver stopping before convergence would go unnoticed.
    lr_accuracy = _fit_score_and_save(
        LogisticRegression(max_iter=1000), {'C': [0.1, 1, 10]},
        'logistic regression', 'lr', feature_type,
        X_train, X_test, y_train, y_test,
    )

    gbc_accuracy = _fit_score_and_save(
        GradientBoostingClassifier(random_state=42), {'n_estimators': [50]},
        'GBC', 'gbc', feature_type,
        X_train, X_test, y_train, y_test,
    )

    # Further algorithms / feature types (e.g. GloVe) can be added with one
    # more _fit_score_and_save call each.
    return {'lr': lr_accuracy, 'gbc': gbc_accuracy}




In [None]:
# feature_types = ['tfidf'] # , 'word2vec']
# X_features = {'tfidf': X_tfidf} #, 'word2vec': X_w2v}

feature_types = ['tfidf' , 'word2vec']
X_features = {'tfidf': X_tfidf, 'word2vec': X_w2v}



for feature_type in feature_types:
  print(f'Training classification using {feature_type}')

  train_classifier(X_features[feature_type], data['category'], 'classifier', feature_type)

Training classification using tfidf
Best logistic regression paarams: tfidf: {'C': 10}
logistic regression accuracy: tfidf: 0.7904509283819628
Best GBC paarams: tfidf: {'n_estimators': 200}
GBC accuracy: tfidf: 0.7572944297082228
Training classification using word2vec
Best logistic regression paarams: word2vec: {'C': 10}
logistic regression accuracy: word2vec: 0.6114058355437666
Best GBC paarams: word2vec: {'n_estimators': 200}
GBC accuracy: word2vec: 0.5954907161803713


In [None]:
'''
user selection of the type of vectorization(tfidf/w2v/glove/fasttext/etc) should be passed
lr_model_tfidf
lr_model_w2v
gbc_model_tfidf
gbc_model_w2v.....
'''

'\nuser selection of the type of vectorization(tfidf/w2v/glove/fasttext/etc) should be passed\nlr_model_tfidf\nlr_model_w2v\ngbc_model_tfidf\ngbc_model_w2v.....\n'

In [None]:
# Order the names by their integer code so that target_names[code] is the
# matching label even if the `categories` dict is ever written in another order.
target_names = sorted(categories, key=categories.get)
joblib.dump(target_names, 'target_names.pkl')

['target_names.pkl']

In [None]:
target_names

['comp', 'rec', 'sci', 'talk', 'soc', 'misc', 'alt']

In [None]:
new_input = "indian won the cricket match"

cleaned_input  = preprocess_text(new_input)
#tokenized_input = word_tokenize(cleaned_input)

tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')
lr_model = joblib.load('lr_model_tfidf.pkl')
tgt_names = joblib.load('target_names.pkl')

X_input = tfidf_vectorizer.transform([cleaned_input])

lr_pred = lr_model.predict(X_input)[0]
print(lr_pred)

print(f'Prediction: {tgt_names[lr_pred]}')


0
Prediction: comp


In [None]:
%%writefile app.py

import streamlit as st
import pandas as pd
import numpy as np
import joblib

#from utils import preprocess_text

import nltk
from nltk.tokenize import word_tokenize

# Make sure the NLTK resources needed for tokenizing, stop-word removal and
# lemmatization are present.  Streamlit re-executes this script on every
# interaction, so the checks are kept quiet to avoid flooding the log.
for _resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(_resource, quiet=True)

from gensim.models import Word2Vec #addfor w2v
from utils import preprocess_text , get_word2vec_embeddings #addfor w2v

from sklearn.metrics.pairwise import cosine_similarity #addfor recom
from textblob import TextBlob #addfor sentiment

@st.cache_resource
def load_models():
    """Load every saved artifact once and build the document feature matrices.

    Reads the TF-IDF vectorizer, the Word2Vec model, the four classifiers, the
    target names and the preprocessed articles from the working directory, then
    vectorizes all articles with both representations for the recommendation
    task.  ``st.cache_resource`` keeps the result across Streamlit reruns.

    Returns
    -------
    tuple
        ``(tfidf_vectorizer, lr_model_tfidf, w2v_model, lr_model_word2vec,
        gbc_model_tfidf, gbc_model_word2vec, tgt_names, data, X_features)``
        where ``X_features`` maps ``'tfidf'`` / ``'word2vec'`` to the matrix of
        document vectors.
    """
    # NOTE: joblib.load unpickles arbitrary objects; only load artifacts that
    # were produced by the training notebook.
    tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')
    lr_model_tfidf = joblib.load('lr_model_tfidf.pkl')
    gbc_model_tfidf = joblib.load('gbc_model_tfidf.pkl')
    tgt_names = joblib.load('target_names.pkl')

    w2v_model = Word2Vec.load('word2vec.model')
    lr_model_word2vec = joblib.load('lr_model_word2vec.pkl')
    gbc_model_word2vec = joblib.load('gbc_model_word2vec.pkl')

    data = pd.read_csv('preprocessed_data.csv')
    # An article whose cleaned text is empty is stored as an empty CSV field and
    # read back as NaN, which neither the vectorizer nor the tokenizer accepts.
    data['claened_text'] = data['claened_text'].fillna('')

    X_tfidf = tfidf_vectorizer.transform(data['claened_text'])
    tokenized_texts = [word_tokenize(text) for text in data['claened_text']]
    X_w2v = np.array([get_word2vec_embeddings(text, w2v_model) for text in tokenized_texts])

    X_features = {'tfidf': X_tfidf, 'word2vec': X_w2v}

    return (tfidf_vectorizer, lr_model_tfidf, w2v_model, lr_model_word2vec,
            gbc_model_tfidf, gbc_model_word2vec, tgt_names, data, X_features)


(tfidf_vectorizer, lr_model_tfidf, w2v_model, lr_model_word2vec,
 gbc_model_tfidf, gbc_model_word2vec, tgt_names, data, X_features) = load_models()

st.title('Text App for Forum')

# Sidebar: which analysis to run and which text representation to use.
task = st.sidebar.selectbox('Select Task', ["Classification", "Recommendation", "Sentiment Analysis"])
feature_type = st.sidebar.selectbox("Select Feature Type", ["TFIDF", "Word2Vec"])

user_input = st.text_area("enter text for analysis:", height=150)

if st.button("Analyze"):
    # Whitespace-only input counts as empty.
    if (user_input or '').strip():
        if task == "Sentiment Analysis":
            # Score the text as typed: preprocessing removes stop words such as
            # "not"/"no" and all punctuation, which would erase negations and
            # distort the polarity.
            sentiment = TextBlob(user_input).sentiment.polarity
            st.subheader("Sentiment Analysis Result")
            st.write(f"Sentiment Polarity: {sentiment:.4f}")
            st.write("Positive" if sentiment > 0 else "Negative" if sentiment < 0 else "Neutral")
        else:
            # Classification and recommendation use the same cleaning and
            # vectorization that was applied to the training articles.
            cleaned_input = preprocess_text(user_input)

            if feature_type == "TFIDF":
                X_input = tfidf_vectorizer.transform([cleaned_input])
                lr_model = lr_model_tfidf
                gbc_model = gbc_model_tfidf
            else:  # "Word2Vec"
                tokenized_input = word_tokenize(cleaned_input)
                X_input = np.array([get_word2vec_embeddings(tokenized_input, w2v_model)])
                lr_model = lr_model_word2vec
                gbc_model = gbc_model_word2vec

            if task == "Classification":
                lr_pred = lr_model.predict(X_input)[0]
                gbc_pred = gbc_model.predict(X_input)[0]
                st.subheader('Results:')
                st.write(f'Logistic Regfression Prediction: {tgt_names[lr_pred]}')
                st.write(f'GBC Prediction: {tgt_names[gbc_pred]}')
            elif task == "Recommendation":
                # Rank all stored articles by cosine similarity to the input
                # and show the five closest, best match first.
                sim_scores = cosine_similarity(X_input, X_features[feature_type.lower()])
                top_indices = sim_scores[0].argsort()[-5:][::-1]
                st.subheader("Top 5 Similar Documents")
                for idx in top_indices:
                    # argsort yields row positions, so look rows up by position
                    st.write(f"**Category**: {data['category_name'].iloc[idx]}")
                    st.write(f"**Text**: {data['text'].iloc[idx]}")
                    st.write("---")
    else:
        st.error('Please enter some text')




Overwriting app.py


In [None]:
# 34.9.13.55

In [None]:
!wget -q -O - ipv4.icanhazip.com

34.80.200.39


In [None]:
!streamlit run app.py & npx localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[1G[0K⠙[1G[0K⠹[1G[0K⠸[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.80.200.39:8501[0m
[0m
[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0Kyour url is: https://nice-mugs-notice.loca.lt
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_da