In [2]:
!pip install small_text

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting small_text
  Downloading small_text-1.1.1-py3-none-any.whl (178 kB)
[K     |████████████████████████████████| 178 kB 4.8 MB/s 
Installing collected packages: small-text
Successfully installed small-text-1.1.1


In [5]:
#Import required packages...............................................................................................
import numpy as np
import pandas as pd
import re
from small_text.base import LABEL_UNLABELED
from scipy import sparse as sp
from sklearn.pipeline import Pipeline
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.metrics import accuracy_score
import nltk

In [6]:
# Fetch the NLTK movie_reviews corpus, then report its number of files and its categories
# (one value per output line).
nltk.download('movie_reviews')
print(len(movie_reviews.fileids()), movie_reviews.categories(), sep='\n')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


2000
['neg', 'pos']


In [9]:
# Data pre-processing helpers.
# Fetch the NLTK stop-word lists first: define_transformers() reads the English list.
nltk.download('stopwords')
def dataframe_data(movie_reviews):
    """Load a categorized corpus into a DataFrame with numeric class labels.

    Parameters
    ----------
    movie_reviews : corpus reader
        Object exposing ``fileids()`` and ``raw(fileid)``. Every file id must
        have the form ``"<category>/<filename>"`` (exactly one ``/``).

    Returns
    -------
    pandas.DataFrame
        One row per file id, default integer index, two columns:
        ``Class`` (integer codes produced by ``LabelEncoder`` from the
        category part of the file id) and ``TextData`` (the raw file text).

    Raises
    ------
    ValueError
        If a file id does not split into exactly two parts on ``/``.
    """
    records = []
    for fileid in movie_reviews.fileids():
        tag, _filename = fileid.split('/')
        records.append((tag, movie_reviews.raw(fileid)))

    # Build a frame that holds only the returned columns and assign the
    # encoded labels as a whole column. Writing through .iloc into a slice of
    # a wider frame is what triggers pandas' SettingWithCopyWarning and can
    # leave the column with object dtype.
    documents = pd.DataFrame(records, columns=['Class', 'TextData'])
    documents['Class'] = LabelEncoder().fit_transform(documents['Class'])
    return documents

def define_transformers(n_of_top_features):
    """Build the text-to-features pipeline.

    The pipeline has three steps, in order:

    1. ``'count vectorizer'`` - ``CountVectorizer`` with lower-casing and the
       NLTK English stop words removed.
    2. ``'chi2score'`` - ``SelectKBest(chi2, k=n_of_top_features)``.
    3. ``'tf_transformer'`` - ``TfidfTransformer(use_idf=True)``.

    Parameters
    ----------
    n_of_top_features : int
        Value passed as ``k`` to ``SelectKBest``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The unfitted pipeline.
    """
    # CountVectorizer documents stop_words as 'english', a list, or None.
    # A set is not an accepted type (parameter validation in recent
    # scikit-learn releases rejects it at fit time), so pass a
    # de-duplicated, deterministically ordered list instead.
    mystopwords = sorted(set(stopwords.words("english")))
    text_processor = Pipeline([
        ('count vectorizer', CountVectorizer(stop_words=mystopwords, lowercase=True)),
        ('chi2score', SelectKBest(chi2, k=n_of_top_features)),
        ('tf_transformer', TfidfTransformer(use_idf=True))
    ])
    return text_processor

def data_cleaning(documents):
    """Add a ``clean`` column holding a normalized copy of ``TextData``.

    Each text is lower-cased and every run of characters outside ``a-z`` /
    ``A-Z`` is replaced by a single space.

    Parameters
    ----------
    documents : pandas.DataFrame
        Frame with a ``TextData`` column of strings. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``documents`` object, now with the ``clean`` column.
    """
    # Iterate over the values rather than looking rows up by 0..n-1 labels, so
    # frames whose index is not the default RangeIndex (filtered, sampled,
    # re-indexed) are cleaned correctly instead of raising KeyError.
    #
    # One substitution is enough: once every non-letter has become a space, a
    # later HTML-tag pattern has nothing left to match, and collapsing runs of
    # digits/non-word characters only ever merges the spaces just inserted.
    documents['clean'] = [
        re.sub('[^a-zA-Z]+', ' ', text.lower())
        for text in documents['TextData']
    ]
    return documents

def transform_cleanData(documents, text_processor, n_labeled_per_class=250, random_state=None):
    """Vectorize the cleaned text and hide the labels of most documents.

    Parameters
    ----------
    documents : pandas.DataFrame
        Frame with a ``clean`` text column and a ``Class`` column whose
        values are 0 and 1.
    text_processor : object
        Transformer exposing ``fit_transform(X, y)`` that returns a sparse
        matrix (anything with ``.todense()``). It is fitted on all rows of
        ``documents`` using their true labels.
    n_labeled_per_class : int, default 250
        How many randomly chosen rows of each class keep their true label.
    random_state : int or numpy random generator, optional
        Forwarded to ``DataFrame.sample`` so the labeled subset can be
        reproduced. ``None`` keeps sampling unseeded.

    Returns
    -------
    proc_text : pandas.DataFrame
        Dense feature matrix, one row per document, indexed like ``documents``.
    prepared_data : pandas.DataFrame
        ``proc_text`` plus a trailing ``Class`` column: the true label for the
        sampled rows and ``-1`` for every other row.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when a class has fewer than
        ``n_labeled_per_class`` rows.
    """
    features = text_processor.fit_transform(documents['clean'], documents['Class'])
    # Carry the documents' own index so features and labels stay row-aligned
    # even when that index is not 0..n-1.
    proc_text = pd.DataFrame(features.todense(), index=documents.index)

    lbl_p = documents.query("Class == 1").sample(n=n_labeled_per_class, random_state=random_state)
    lbl_n = documents.query("Class == 0").sample(n=n_labeled_per_class, random_state=random_state)
    keeps_label = documents.index.isin(lbl_n.index.union(lbl_p.index))

    # Mask the remaining labels by position instead of building a fixed-length
    # list, so the function works for any corpus size.
    prepared_data = proc_text.copy()
    prepared_data['Class'] = np.where(keeps_label, documents['Class'].to_numpy(), -1)
    return proc_text, prepared_data

def label_unlabel_data(preapredData):
    """Split the prepared frame into labeled and unlabeled rows.

    Rows whose ``Class`` equals 1 or 0 are the labeled ones; they are
    returned with the class-1 rows first, followed by the class-0 rows.
    Every row whose index label is not among the labeled rows is returned as
    unlabeled.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(labeled, unlabeled)``.
    """
    class_values = preapredData['Class']
    labeled = pd.concat(
        [preapredData[class_values == 1], preapredData[class_values == 0]],
        axis=0,
    )
    unlabeled = preapredData.drop(labeled.index)
    return labeled, unlabeled


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [13]:
#Models................................................................................................................
def use_SVC(x_train,x_test,y_train,y_test):
    """Fit an RBF-kernel and a linear-kernel SVC and score both on the test split.

    Returns
    -------
    tuple
        ``(rbf_model, linear_model, rbf_accuracy, linear_accuracy)`` where
        each accuracy is ``accuracy_score(y_test, model.predict(x_test))``.
    """
    fitted = []
    accuracies = []
    for kernel_name in ('rbf', 'linear'):
        classifier = SVC(kernel=kernel_name)
        classifier.fit(x_train, y_train)
        accuracies.append(accuracy_score(y_test, classifier.predict(x_test)))
        fitted.append(classifier)
    return fitted[0], fitted[1], accuracies[0], accuracies[1]

#Query for new data from Unlabeled dataset..............................................................................
def get_data(unlabeled,a):
    """Draw ``a`` rows at random (without replacement) from ``unlabeled``.

    Returns a new DataFrame; ``unlabeled`` itself is left unchanged.
    """
    return unlabeled.sample(n=a)

#Update our labeled data................................................................................................
def update_lbl_data(m, qdata, lbld, pos_threshold=0.5, neg_lower=-0.5, neg_upper=0.0):
    """Pseudo-label part of a queried batch and append it to the labeled set.

    The model scores every row of ``qdata`` with ``decision_function``. A row
    is accepted when its score is strictly above ``pos_threshold`` or lies
    strictly between ``neg_lower`` and ``neg_upper``. Accepted rows get the
    model's ``predict`` output as their ``Class``.

    NOTE(review): with the defaults the "negative" rule accepts scores in
    (-0.5, 0), i.e. close to the decision boundary, while the positive rule
    accepts scores far from it (> 0.5). If confidently negative rows were
    intended, the rule would be ``score < -0.5`` - confirm before changing
    the defaults; they reproduce the notebook's existing behavior.

    Parameters
    ----------
    m : fitted classifier
        Must expose ``predict`` and ``decision_function``; the scores are
        assumed to be one value per row (binary problem).
    qdata : pandas.DataFrame
        Queried rows: feature columns followed by a last column ``Class``.
        It is not modified.
    lbld : pandas.DataFrame
        Current labeled set, same columns as ``qdata``.
    pos_threshold, neg_lower, neg_upper : float
        Acceptance bounds described above (all comparisons are strict).

    Returns
    -------
    updated : pandas.DataFrame
        ``lbld`` with the newly labeled rows appended.
    new_lbl : pandas.DataFrame
        Only the newly labeled rows: negative-band rows first, then positive
        rows, each group in ``qdata`` order.
    """
    features = qdata.iloc[:, :-1]
    pred = np.asarray(m.predict(features))
    scores = np.asarray(m.decision_function(features))

    # Plain boolean masks select the same rows as intersecting sets of float
    # scores and filtering with isin(), without comparing floats for equality.
    in_negative_band = (scores > neg_lower) & (scores < neg_upper)
    above_positive = scores > pos_threshold
    picked = np.concatenate(
        [np.flatnonzero(in_negative_band), np.flatnonzero(above_positive)]
    )

    # Work on a copy so the caller's batch does not grow helper columns and
    # can safely be scored again.
    new_lbl = qdata.iloc[picked].copy()
    new_lbl['Class'] = pred[picked]
    updated = pd.concat([lbld, new_lbl], axis=0)
    return updated, new_lbl

def update_ulb(ulb,newidx):
    """Return ``ulb`` without the rows whose index labels appear in ``newidx``.

    ``newidx`` is a DataFrame; only its index is used. ``ulb`` itself is not
    modified.
    """
    labels_to_remove = newidx.index.to_numpy()
    return ulb.drop(index=labels_to_remove)


In [17]:
# End-to-end data preparation.
# 1. Load the corpus into a frame with a numeric Class column and the raw text.
documents = dataframe_data(movie_reviews)
# 2. Build the count -> chi2 selection -> TF-IDF pipeline, keeping k=500 features.
text_processor = define_transformers(500)
# 3. Add the normalized 'clean' text column (data_cleaning returns the same frame).
cleaned_data = data_cleaning(documents)
# 4. Fit the pipeline and build the feature frame; prepared_data additionally carries
#    a Class column in which only a random subset of rows keeps its true label (rest = -1).
proc_text, prepared_data = transform_cleanData(cleaned_data, text_processor)
print(proc_text)
print(prepared_data)
# 5. Separate the rows with Class 0/1 from the remaining (unlabeled) rows.
labeled, unlabeled = label_unlabel_data(prepared_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[selected_item_labels] = value


In [19]:
# Self-training loop: retrain on the current labeled set, score a random batch
# of unlabeled rows with the linear SVC, move the accepted rows (with their
# predicted class) into the labeled set, and repeat until fewer than 500
# unlabeled rows remain.
#
# Rows that update_lbl_data does not accept stay in `unlabeled`, so the pool
# may stop shrinking before it drops below 500. The stall guard ends the loop
# after MAX_STALLED_ROUNDS consecutive batches that add nothing, instead of
# letting the cell run forever.
MAX_STALLED_ROUNDS = 20
iteration = 0
stalled_rounds = 0
while len(unlabeled) >= 500:
    # Feature columns are everything but the last column; the last column is Class.
    x_train, x_test, y_train, y_test = train_test_split(labeled.iloc[:, :-1], labeled.iloc[:, -1])
    model1, model2, acc1, acc2 = use_SVC(x_train, x_test, y_train, y_test)
    queried_data = get_data(unlabeled, 60)
    # Only the linear-kernel model (model2) is used for pseudo-labeling.
    labeled, newlbl = update_lbl_data(model2, queried_data, labeled)
    unlabeled = update_ulb(unlabeled, newlbl)
    iteration = iteration + 1
    print("iteration: ", iteration)

    stalled_rounds = stalled_rounds + 1 if len(newlbl) == 0 else 0
    if stalled_rounds >= MAX_STALLED_ROUNDS:
        print("stopping: no rows labeled in the last", MAX_STALLED_ROUNDS, "iterations")
        break
      

iteration:  1
iteration:  2
iteration:  3
iteration:  4
iteration:  5
iteration:  6
iteration:  7
iteration:  8
iteration:  9
iteration:  10
iteration:  11
iteration:  12
iteration:  13
iteration:  14
iteration:  15
iteration:  16
iteration:  17
iteration:  18
iteration:  19
iteration:  20
iteration:  21
iteration:  22
iteration:  23
iteration:  24
iteration:  25
iteration:  26
iteration:  27
iteration:  28
iteration:  29
iteration:  30
iteration:  31
iteration:  32
iteration:  33
iteration:  34
iteration:  35
iteration:  36
iteration:  37
iteration:  38
iteration:  39
iteration:  40
iteration:  41
iteration:  42
iteration:  43
iteration:  44
iteration:  45
iteration:  46
iteration:  47
iteration:  48
iteration:  49
iteration:  50
iteration:  51
iteration:  52
iteration:  53
iteration:  54
iteration:  55
iteration:  56
iteration:  57
iteration:  58
iteration:  59
iteration:  60
iteration:  61
iteration:  62
iteration:  63
iteration:  64
iteration:  65
iteration:  66
iteration:  67
iter

In [20]:
# Inspect the labeled set as it stands after the self-training loop.
print(labeled)

        0    1    2    3    4         5         6    7         8    9  ...  \
1012  0.0  0.0  0.0  0.0  0.0  0.117402  0.168458  0.0  0.000000  0.0  ...   
1018  0.0  0.0  0.0  0.0  0.0  0.152531  0.109431  0.0  0.000000  0.0  ...   
1024  0.0  0.0  0.0  0.0  0.0  0.146546  0.000000  0.0  0.000000  0.0  ...   
1025  0.0  0.0  0.0  0.0  0.0  0.048970  0.000000  0.0  0.000000  0.0  ...   
1028  0.0  0.0  0.0  0.0  0.0  0.000000  0.000000  0.0  0.082701  0.0  ...   
...   ...  ...  ...  ...  ...       ...       ...  ...       ...  ...  ...   
1105  0.0  0.0  0.0  0.0  0.0  0.034429  0.000000  0.0  0.000000  0.0  ...   
1634  0.0  0.0  0.0  0.0  0.0  0.018213  0.000000  0.0  0.000000  0.0  ...   
638   0.0  0.0  0.0  0.0  0.0  0.068691  0.000000  0.0  0.000000  0.0  ...   
878   0.0  0.0  0.0  0.0  0.0  0.232649  0.083456  0.0  0.000000  0.0  ...   
792   0.0  0.0  0.0  0.0  0.0  0.000000  0.000000  0.0  0.000000  0.0  ...   

           491       492       493       494  495       496    

In [21]:
# Inspect the rows that are still unlabeled after the self-training loop.
print(unlabeled)

        0    1    2         3    4         5         6    7         8    9  \
1     0.0  0.0  0.0  0.000000  0.0  0.000000  0.000000  0.0  0.000000  0.0   
2     0.0  0.0  0.0  0.000000  0.0  0.000000  0.000000  0.0  0.000000  0.0   
3     0.0  0.0  0.0  0.000000  0.0  0.000000  0.000000  0.0  0.108199  0.0   
4     0.0  0.0  0.0  0.000000  0.0  0.087561  0.000000  0.0  0.135205  0.0   
7     0.0  0.0  0.0  0.000000  0.0  0.046322  0.000000  0.0  0.000000  0.0   
...   ...  ...  ...       ...  ...       ...       ...  ...       ...  ...   
1932  0.0  0.0  0.0  0.000000  0.0  0.066871  0.000000  0.0  0.206513  0.0   
1937  0.0  0.0  0.0  0.000000  0.0  0.090432  0.389275  0.0  0.000000  0.0   
1952  0.0  0.0  0.0  0.000000  0.0  0.150128  0.000000  0.0  0.000000  0.0   
1963  0.0  0.0  0.0  0.000000  0.0  0.056150  0.000000  0.0  0.000000  0.0   
1986  0.0  0.0  0.0  0.093208  0.0  0.037881  0.000000  0.0  0.116985  0.0   

      ...       491  492       493       494  495  496       49