In [22]:
# Importing the required libraries.
import numpy as np
import pickle, zlib
from random import sample
import scipy.cluster.hierarchy as sch
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim import corpora

import pandas as pd
import matplotlib.pyplot as plt

import gensim
# print(gensim.__version__)    # Uncomment to check the installed gensim version (this notebook was run with 3.6.0).


3.6.0


In [3]:
# Mount Google Drive into the Colab filesystem at /content/drive/ so that the
# dataset, model and prediction files stored on Drive can be read and written.
# force_remount=True asks Colab to mount again even if Drive is already mounted.
# NOTE(review): the data paths used later are relative ('drive/My Drive/...'),
# which presumably relies on the working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/drive/',force_remount=True)

Mounted at /content/drive/


In [4]:
# Employment classes and pace-of-growth classes.
# The list order matters: ClassifyNew maps argmax positions back to labels via
# these lists, and indexes the per-class centroid list employment-major,
# growth-minor (centroid k*N_GROWTH + g belongs to emp_type[k], growth_type[g]).
emp_type = ['Unemp', 'Agri', 'Non Agri']
growth_type = ['Slow', 'Average', 'Fast']

# Class counts are derived from the label lists so they cannot drift apart.
N_EMP = len(emp_type)
N_GROWTH = len(growth_type)

def ClassifyNew(df, model, MRV_Emp, MRVs):
    """Predict the employment type and pace of growth for every row of ``df``.

    For each row a document vector is inferred from the row's ``'Keywords'``
    value and compared, by cosine similarity, against class centroids; the
    label of the most similar centroid is the prediction.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'Keywords'`` (passed to
        ``model.infer_vector``; presumably a list of tokens — confirm against
        the dataset) and ``'Emp'`` (the actual employment label, which must be
        one of ``emp_type``).
    model : object
        Anything exposing ``infer_vector(doc, alpha=..., epochs=...)``; the
        notebook passes a gensim ``Doc2Vec`` model.
    MRV_Emp : sequence of array-like
        One centroid vector per employment class, in ``emp_type`` order.
    MRVs : sequence of array-like
        One centroid vector per (employment, growth) class, ordered
        employment-major: entries ``N_GROWTH*k`` to ``N_GROWTH*(k+1) - 1`` are
        the growth centroids of ``emp_type[k]``, in ``growth_type`` order.

    Returns
    -------
    pandas.DataFrame
        The *same* ``df`` object, modified in place with two new columns,
        ``'PredEmp'`` and ``'PredGrowth'``.

    Raises
    ------
    ValueError
        If the number of centroids does not match the number of classes, or
        if a row's ``'Emp'`` value is not in ``emp_type``.

    Notes
    -----
    The pace of growth is predicted among the growth centroids of the row's
    *actual* employment class, not of the predicted one.
    """
    if len(MRV_Emp) != len(emp_type):
        raise ValueError(
            'Expected %d employment centroids, got %d' % (len(emp_type), len(MRV_Emp)))
    if len(MRVs) < len(emp_type) * N_GROWTH:
        raise ValueError(
            'Expected at least %d growth centroids, got %d'
            % (len(emp_type) * N_GROWTH, len(MRVs)))

    # The centroids do not change between rows: stack them once into
    # (n_classes, dim) matrices so each row needs a single similarity call.
    emp_centroids = np.vstack([np.asarray(vec).reshape(1, -1) for vec in MRV_Emp])
    growth_centroids = np.vstack([np.asarray(vec).reshape(1, -1) for vec in MRVs])

    PredEmp = []
    PredGrowth = []
    for txt, actualEmp in zip(df['Keywords'], df['Emp']):
        # Infer a vector for this article (pre-processed text).
        inferred = model.infer_vector(txt, alpha=0.1, epochs=100).reshape(1, -1)

        # Predicted employment type: closest employment centroid.
        emp_scores = cosine_similarity(inferred, emp_centroids)[0]
        PredEmp.append(emp_type[int(np.argmax(emp_scores))])

        # Predicted pace of growth: closest growth centroid within the block
        # of the GIVEN employment type (not the predicted one).
        actEmp = emp_type.index(actualEmp)
        growth_block = growth_centroids[N_GROWTH * actEmp:N_GROWTH * (actEmp + 1)]
        growth_scores = cosine_similarity(inferred, growth_block)[0]
        PredGrowth.append(growth_type[int(np.argmax(growth_scores))])

    # Add the predictions to the caller's dataframe (in place).
    df['PredEmp'] = PredEmp
    df['PredGrowth'] = PredGrowth

    return df

In [19]:
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# File path prefixes for the datasets, models and predictions. The dataset /
# model / collection name is appended to the matching prefix.
FOLDER = 'drive/My Drive/'
PATHS = {
    'Test': FOLDER+'Split/Temporal/test_',
    'Train': FOLDER+'Split/Temporal/train_',
    'Model': FOLDER+'Split/Temporal/',
    'Pred': FOLDER+'Split/Temporal/Prediction/Pred_',
}

# Datasets and their models (paired by position).
datasets = ['dataset_agriculture', 'dataset_development', 'dataset_environment', 'dataset_industrialization', 'dataset_lifestyle']
models = ['model_agriculture', 'model_development', 'model_environment', 'model_industrialization', 'model_lifestyle']

# For testing purposes specific datasets can be selected:
#   MODE == 'Test' -> only 'dataset_development' is processed (if selected);
#   otherwise      -> every dataset whose SELECT flag is non-zero is processed.
MODE = 'Test'
SELECT = {'dataset_agriculture':1,'dataset_development':1,'dataset_environment':1,'dataset_industrialization':1,'dataset_lifestyle':0}

SAVE = False # Save prediction file or not

# Pandas display options. Fully qualified option names are used because a
# partial name such as 'max_rows' can match several options and be rejected.
pd.set_option('display.max_colwidth', 15)
pd.set_option('display.max_rows', 10)

# Position of every (employment, growth) class in the per-class lists:
# employment-major, growth-minor, i.e. the layout ClassifyNew slices with
# N_GROWTH*k : N_GROWTH*(k+1).
N_CLASSES = N_EMP * N_GROWTH
CLASS_INDEX = {
    (emp, growth): emp_pos * N_GROWTH + growth_pos
    for emp_pos, emp in enumerate(emp_type)
    for growth_pos, growth in enumerate(growth_type)
}


def _load_compressed_pickle(path):
    """Load an object stored as pickle.dump(zlib.compress(pickle.dumps(obj)))."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.loads(zlib.decompress(pickle.load(handle)))


# ---------------------------------------------------------------------------
# Per-collection prediction
# ---------------------------------------------------------------------------
for dataset_name, model_name in zip(datasets, models):
    if MODE == 'Test' and dataset_name != 'dataset_development':
        continue
    if SELECT[dataset_name] == 0:
        continue

    # Printing the collection name (dataset name without the 'dataset_' prefix).
    collection_name = dataset_name[len('dataset_'):]
    print('\nCollection:',collection_name.capitalize())

    # Loading the train/test datasets and the model from the drive.
    train_dataset = _load_compressed_pickle(PATHS['Train']+dataset_name)
    test_dataset = _load_compressed_pickle(PATHS['Test']+dataset_name)
    model = Doc2Vec.load(PATHS['Model']+model_name)

    ## -- CALCULATING GLOBAL CENTROIDS USING THE TRAIN DATASET  --##
    # Columns in a dataset row --> 0: ArticleId, 1: Title, 2: Text,
    # 3: Keywords (processed text), 4: Date (YYYY,MM,DD), 5: ID, 6: Emp,
    # 7: POG (pace of growth), 8: (not used)
    #
    # Collect, per class, the distinct article ids and their trained vectors.
    # Rows whose (Emp, POG) pair is not a known class are skipped.
    temp_ids = [set() for _ in range(N_CLASSES)]
    temp_vectors = [[] for _ in range(N_CLASSES)]
    temp_datasets = [[] for _ in range(N_CLASSES)]
    for row in train_dataset:
        class_pos = CLASS_INDEX.get((row[6], row[7]))
        if class_pos is None or row[0] in temp_ids[class_pos]:
            continue
        article_vector = model.docvecs[row[0]]
        temp_ids[class_pos].add(row[0])
        temp_vectors[class_pos].append(article_vector)
        temp_datasets[class_pos].append([row[0], article_vector])

    # A class without training articles has no centroid; fail with a clear
    # message instead of producing a NaN centroid that breaks later on.
    empty_classes = [label for label, pos in CLASS_INDEX.items() if not temp_vectors[pos]]
    if empty_classes:
        raise ValueError(
            'No training articles for class(es) %s in collection %r'
            % (empty_classes, collection_name))

    # Centroid of every (employment, growth) class: element-wise median of the
    # class's article vectors.
    MRVs = [np.median(class_vectors, axis=0) for class_vectors in temp_vectors]

    # Centroid of every employment class.
    # NOTE(review): this is the element-wise median of the class's N_GROWTH
    # growth centroids (a median of medians), not the median over all article
    # vectors of the class. It reproduces the original computation, in which
    # the raw vector lists had already been replaced by their medians when the
    # employment centroids were built — confirm this is the intended definition.
    MRV_Emp = [
        np.median(MRVs[N_GROWTH * emp_pos:N_GROWTH * (emp_pos + 1)], axis=0)
        for emp_pos in range(N_EMP)
    ]

    #---X---X--- GLOBAL CENTROIDS CALCULATED ---X---X---#

    # Create a pandas dataframe of the test dataset.
    test_df = pd.DataFrame(test_dataset)
    test_df.columns = ['ArticleId','Title','Text','Keywords','Date','DistrictId','Emp','Growth','Type']
    test_df = test_df.drop(columns=['Type']) # Not required

    # Predicting the emp types and pace of growth
    Pred_df = ClassifyNew(test_df, model, MRV_Emp, MRVs)

    # Marking Outliers: rows whose prediction disagrees with the given label.
    Pred_df['EmpOut'] = Pred_df['Emp']!=Pred_df['PredEmp']              # Emp Outlier
    Pred_df['GrowthOut'] = Pred_df['Growth']!=Pred_df['PredGrowth']     # Growth Outlier

    # Save Prediction File (same container format as the input datasets:
    # a pickled, zlib-compressed pickle of the rows).
    if SAVE:
        # The pickle protocol belongs to pickle.dumps; it used to be passed to
        # zlib.compress, where it was interpreted as the compression level.
        payload = zlib.compress(pickle.dumps(Pred_df.values.tolist(), pickle.HIGHEST_PROTOCOL))
        with open(PATHS['Pred']+collection_name, 'wb') as file_pred:
            pickle.dump(payload, file_pred, pickle.HIGHEST_PROTOCOL)


Collection: Development


NameError: ignored