In [1]:
### Marjoriikka Ylisiurua University of Helsinki 28 Sep 2019
# CSSS summer school report:
# for modeling:
# get tf-idf transformations for various Suomi24-document types
# includes superfluous code

import logging
import os
import numpy as np
from gensim import corpora, models, similarities, utils
from gensim.corpora import TextCorpus, MmCorpus, Dictionary
from collections import defaultdict

from time import time

import csv

import pandas as pd
import nltk
import nltk.data
from nltk import snowball
from nltk.corpus import stopwords

## to use sklearn instead of gensim

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

## init

logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO)

processedText = []

sampleData = True  # kiikku: analysing sample (True) or whole data set (False)
existingLemmaset = False  # kiikku: using existing dictionary and corpus (True) or creating from scratch (False)

## init file directories
if sampleData:
    searchDirectorySample = ("/Users/mry/SFI/")
    outcomeSampleDir = ("/Users/mry/SFI/")
    outcomeGensimSampleDir = ("/Users/mry/SFI/")  # gensim output filepath, note only one slash / !!
    openFileName = "/Users/mry/SFI/sampleEconomicLemmas.csv"  ## original csv file with lemmatized texts
    saveFileName = "/Users/mry/SFI/sampleEconomicDict.csv" ## save dictionary with word ids
else:
    searchDirectory = ("/Users/mry/SFI/")
    outcomeDir = ("/Users/mry/SFI/")
    outcomeGensimDir = ("/Users/mry/SFI/")  # gensim outcome filepath
    openFileName = "/Users/mry/SFI/SampleData/headEconomy.txt" ## original csv file with lemmatized texts
    saveFileName = "/Users/mry/SFI/SampleData/headEconomy.csv" ## save dictionary with word ids

    ## init some document level selection parameters
if sampleData:
    forumNumber = 6  # which forum to search
    appearance = 0  # how many times a word must be present to be included in the dictionary
    howLarge = 100  # size of document chunks
else:
    forumNumber = 150  # which forum to search
    appearance = 10  # how many times a word must be present to be included in the dictionary
    howLarge = 50000  # size of document chunks



In [2]:
## class MyCorpus(object):
# initialize corpus with dictionary and top directory
# then read all files in the top dir and preprocess each
class MyCorpus(object):
    def __init__(self, top_dir):
        # self = corpora.MmCorpus("/Users/mry/SFI/corpus.mm")
        self.top_dir = top_dir
        self.dictionary = Dictionary(iter_documents(top_dir))

    def __iter__(self):
        for line in iter_documents(self.top_dir):
            # print("\n line")
            # print(line)
            yield preProcess(line)

In [3]:
## def preProcess(documents):
# take a list of paragraph-strings
# tokenize each paragraph-string, lowercase, clean stopwords
# stem/lemmatize
def preProcess(documents):
    ## init
    tokens = []
    filtered_tokens = []
    tokens_stopwords = []
    tokens_stemmed = []

    # remove trash from text
    # especially consider the bot ads
    # stopSigns = "/p p a quot > < ( ) ! , . : ; & ? * NUOLI + [ ] ... / # '' -- -blank /a “ ” http paypal"
    stopSigns = "/p p a quot > < ( ) ! , . : ; & ? * NUOLI + [ ] ... / # '' -- -blank /a “ ” this message has been removed by"
    stoplist = [w for w in stopSigns.split()]
    stoplist = stoplist + (stopwords.words("finnish"))

    ## tokenize the sentence paragraphs & edit

    tokens = [nltk.word_tokenize(el) for el in documents]  # el = string paragraph element in list
    # print("\n tokenized document")
    # print(tokens)

    # lowercase the tokens and clean the documents of stopwords
    for text in tokens:
        filtered_tokens = [word.lower() for word in text if not word.lower() in stoplist]
        # print("\n tokenized lowercase text, stopwords removed")
        # print(sorted(filtered_tokens)) # print each tokenized document paragraph
        tokens_stopwords.append(filtered_tokens)

    # print("\n cleaned texts in a list by element")
    # print(sorted(tokens_stopwords)) # print the whole set of tokenized texts before further filtering

    # if not using lemmatized word list from Korp
    # for each token in list, stem all remaining words

    tokens_stemmed = tokens_stopwords

    # print("\n stemmed text")
    # print(sorted(tokens_stemmed))

    # returned list of tokens
    return tokens_stemmed

In [4]:
## def weedTokens(tokens_stemmed):
# weed token list just to include those who appear more than APPEARANCE # of times
# encode token strings
def weedTokens(tokens_stemmed):
    print("\nweeding tokens, including only those that appear at least " + str(appearance) + " times")

    ## init
    frequency = defaultdict(int)
    tokens_enough = []

    ## count appearance of tokens
    for token_list in tokens_stemmed:
        # print("\n token list in tokens stemmed")
        # print(token_list)
        for token in token_list:
            if token:
                frequency[token] += 1

    # discard words that only appear APPEARANCE amount of times and return the rest
    # encode the strings to utf-8 at the same time
    for tokens in tokens_stemmed:
        filtered_tokens = [bytes(token, "utf-8") for token in tokens if frequency[token] > appearance]
        tokens_enough.append(filtered_tokens)

    # print(tokens_enough)

    return tokens_enough

In [5]:
## def transformation(corpus,vec):
# transformation NOT WORKING IF MATRIX TOO SPARSE?
# def transformation(corpus,vec):
# init transformation that is used to convert the corpus from one vector representation to another
# tfIdf is simple transformation
# it takes documents represented as bags of words counts and applies a weighting
# which discounts common terms (promotes rare terms)
# tfidf scales the resulting vector to unit length, in the Euclidean norm
#    tfidf = models.TfidfModel(corpus)
#    print("\n tfidf-transformation for corpus: format - (ID, #)")
#    print(tfidf[vec])

# transform a corpus and index it
#    index = similarities.SparseMatrixSimilarity(tfidf[corpus],num_features=2)

# query the similarity of query vector against every document in the corpus
#    sims = index[tfidf[vec]]
#    print("\n Similarity vector: format - (sample document x corpus stem ID, similarity score %)")
#    print(sims)
#    return(list(enumerate(sims)))

#  -> which document is most similar to our vector, what is its score

In [6]:
## main
if __name__ == "__main__":
    # set working directory
    if sampleData:
        print("sample")
        os.chdir(searchDirectorySample)
    else:
        printt("full data")
        os.chdir(searchDirectory)

    ## if starting from scratch, create new dic and corpus file
    if not existingLemmaset:

        if sampleData:
            
            ##for pilot data:
            
            # this small sample worked fine
            #texts = ["nyt teksti tulla teksti numero yksi", "sitten numero kaksi", "tulla kolmas", lopuksi viides teksti ja kuudes ja loppu"]
            
            # this small sample is not working; here the document number is known (key is thread_id)
            #texts = {
            #    "doc1":"nyt teksti tulla teksti numero yksi", "doc2":"sitten numero kaksi", "doc3":"tulla kolmas", "doc4":"lopuksi viides teksti ja kuudes ja loppu"
            #}
            
            # this is not working either; this is the outcome from below, the dictionary created from the full file
            # the idea is that one thread has many comments and they are all appended into the value list under the thread_id key
            #texts = {
            #    "doc1":["nyt teksti tulla teksti numero yksi", "lista arvo toinen"], "doc2":["sitten numero kaksi"], "doc3":["tulla kolmas", "vielä neljäs"], "doc4":["lopuksi viides teksti ja kuudes ja loppu"]
            #}
            
            # for tdfif_vectorizer, corpus should be a list of strings 
            # ie. I'm hoping to read all comments in one thread into a row in the list. 
            # these are values from the dictionary so that the dictionary keeps the thread_ids and the original comments intact
            
            ##for small data:
            ## open one file with rows of data including lemmatized sentences in one cell
            with open(openFileName, "r", encoding="utf-8") as openFile:
                documents = csv.DictReader(openFile, delimiter=",") #\t for txt , for csv
                texts = defaultdict(list)
                for row in documents:
                    texts[row["tid"]].append(row["lemmas"])
                #print(texts) 
                
        
            # I try to do it like this so that we could eg. save the dictionary here and use it later 
            # in order to create further corpuses where the documents are different combinations of comments
            # eg. version two, document is a full subforum and corpus is the collection of these subforums
          
        else:
            # print("\nfollowing files found in directory: " + str(os.getcwd()))
            ## open one file with rows of data including lemmatized sentences in one cell
            with open(openFileName, "r", encoding="utf-8") as openFile:  # vai latin1
                documents = csv.DictReader(openFile, delimiter=",") #\t for txt , for csv
                texts = defaultdict(list)
                for row in documents:
                    texts[row["tid"]].append(row["lemmas"])
                #print(texts) 
            
        #after I have the corpus as dictionary, i need to pick all the values from the lists into one string row 
        #that then becomes the corpus proper
              
        ## init scikit-learn vectorizer
        # returns term-document matrix; equivalent to fit followed by transform, but more effective
        # loses the link to document ids, as dict is not ordered. this is why separate dictionary with doc ids as keys seems necessary
            
        vectorizer = TfidfVectorizer()
            
        # for tdfif_vectorizer, corpus should be a list of strings 
        # first trial: each thread = document, all comments concatenated into one string
        # second trial: each sub-discussion forum level = document, all comments concatenated into one string
            
        # this is no working for the actual dictionary data set
        #corp = vectorizer.fit_transform(key.values() for key, value in texts.items())
        
        print("BEGIN GENSIM - PREPROCESS")
        # this works for the simplest data set that is only a list
        texts = ["nyt teksti tulla teksti numero yksi", "sitten numero kaksi", "tulla kolmas", "lopuksi viides teksti ja kuudes ja loppu"]
            
        # preprocess stop words et al.
        processedText = processedText + preProcess(texts)
        
        # weed the text if you want to minimize the dictionary by removing words that are too rare
        # recommended: five words, already does a lot but it depends a bit on the data size
        print("\npre-processed text weeded next")
        processedText = weedTokens(processedText)
        print(processedText)
        
        #############
        ## here's how i did it earlier with gensim.corpora
        # create dictionary and save it for future use as .dic
        # do this every time you update your document collection
        dic = corpora.Dictionary(processedText)
        
        print("CONTINUE GENSIM - DIC AND CORPUS")
        print(dic)
        print("\ndictionary of documents: format - (stem, ID)")
        print(dic.token2id)
        #if sampleData:
        #    with open(saveFileName, "w") as saveFile:
        #        writer = csv.DictWriter(saveFile, dic.keys())
        #        #writer.writeheader()
        #        writer.writerows(dic.token2id)
        #else:
        #    dic.save(outcomeGensimDir + "dic.dic")  # Store the dictionary for future reference
        #
        #
        # create corpus and save it for future use
        # this gensim corpus resides fully in RAM memory as plain python list
        # with very large datasets this won't do and its preferable to access each document from a file
        # this is why I try to replace this simple solution with the scikit dictionary
        corpus = [dic.doc2bow(text) for text in processedText]
        if sampleData:
            corpora.MmCorpus.serialize(outcomeGensimSampleDir + "corpus sample.mm", corpus)
        else:
            corpora.MmCorpus.serialize(outcomeGensimDir + "corpus.mm", corpus)
        print("\ncorpus of documents: format - (ID, # of occurrences)")
        print(corpus)
            
        ###############
        ## here's how I tried to redo it with scikit learn
        print("BEGIN SCIKIT LEARN")
        corp = vectorizer.fit_transform(texts)
            
        # This part you want saved into csv/txt-file; first, document ID (eventually, thread_id), then word ID
        # not working
        print("save this one:")
        a = corp.toarray()
        np.savetxt("foo.csv", a, delimiter=";")
        print(corp.shape)
        print("saved!")

        # this part you want saved into other csv/txt-file: first, word ID, then word tfidf
        # not working
        print("Extracting features from the data:")
        feature_names = vectorizer.get_feature_names()
        print("feature names:")
        print(feature_names)
        feature_names = np.asarray(feature_names)
            
        print(feature_names)
        
        print("COMPARE PANDAS")
        # trying things out
        df2 = pd.DataFrame(corp)
        print(df2.head())
        #sparse to dense version (don't use for large vocabulary)
        #corpus_index = [n for n in texts]
        #dense = corp.todense()
        #denselist = dense.tolist()
        #df = pd.DataFrame(denselist, columns = feature_names, index = corpus_index)
        #print(df.head())
    


2019-10-09 10:33:15,057 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2019-10-09 10:33:15,058 : INFO : built Dictionary(11 unique tokens: ['numero', 'teksti', 'tulla', 'yksi', 'kaksi']...) from 4 documents (total 15 corpus positions)
2019-10-09 10:33:15,058 : INFO : storing corpus in Matrix Market format to /Users/mry/SFI/corpus sample.mm
2019-10-09 10:33:15,061 : INFO : saving sparse matrix to /Users/mry/SFI/corpus sample.mm
2019-10-09 10:33:15,061 : INFO : PROGRESS: saving document #0
2019-10-09 10:33:15,062 : INFO : saved 4x11 matrix, density=31.818% (14/44)
2019-10-09 10:33:15,063 : INFO : saving MmCorpus index to /Users/mry/SFI/corpus sample.mm.index


sample
defaultdict(<class 'list'>, {'123': ['nyt teksti tulla teksti numero yksi', 'sitten numero kaksi'], '200': ['tulla kolmas', 'lopuksi viides teksti ja kuudes ja loppu'], '35': ['sitten teksti talous'], '42': ['jos teksti toinen talous', 'ehkä teksti talous vielä']})
BEGIN GENSIM - PREPROCESS

pre-processed text weeded next

weeding tokens, including only those that appear at least 0 times
[[b'teksti', b'tulla', b'teksti', b'numero', b'yksi'], [b'sitten', b'numero', b'kaksi'], [b'tulla', b'kolmas'], [b'lopuksi', b'viides', b'teksti', b'kuudes', b'loppu']]
CONTINUE GENSIM - DIC AND CORPUS
Dictionary(11 unique tokens: ['numero', 'teksti', 'tulla', 'yksi', 'kaksi']...)

dictionary of documents: format - (stem, ID)
{'numero': 0, 'teksti': 1, 'tulla': 2, 'yksi': 3, 'kaksi': 4, 'sitten': 5, 'kolmas': 6, 'kuudes': 7, 'loppu': 8, 'lopuksi': 9, 'viides': 10}

corpus of documents: format - (ID, # of occurrences)
[[(0, 1), (1, 2), (2, 1), (3, 1)], [(0, 1), (4, 1), (5, 1)], [(2, 1), (6, 1)], 