In [1]:
import csv
import math
import os
import pickle
import re
from collections import Counter
from collections import OrderedDict

import matplotlib.pyplot as plt
import nltk
import numpy as np
import seaborn as sns
from lxml import etree
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources before they are used so the notebook also runs on a
# machine with an empty nltk_data folder: loading the stopword list raises
# LookupError when the 'stopwords' corpus is missing.
# NOTE(review): 'punkt' is presumably needed by word_tokenize; recent NLTK
# releases may ask for 'punkt_tab' instead -- confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

# English stopwords as a set for O(1) membership tests during tokenisation.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Paths
# Provide path to tagfile, SO data-folder and output_location.
# DATA_DIR is the single place to edit when the Stack Exchange dumps live
# somewhere else; every entry of `datamap` is derived from it.
DATA_DIR = "C:/Users/harshitgujral/Desktop/Stack/data"

# Short site code -> sub-folder of DATA_DIR that contains that site's Posts.xml.
SITE_FOLDERS = OrderedDict([
    ("AE", "android.stackexchange.com"),
    ("DB", "dba.stackexchange.com"),
    ("SO", "stackoverflow.com-Posts"),
    ("SE", "softwareengineering.stackexchange.com"),
    ("SF", "serverfault.com"),
    ("SU", "superuser.com"),
])

# Short site code -> full path of the Posts.xml file (insertion order preserved).
# Forward slashes are kept on purpose so the resulting strings are identical to
# the previously hard-coded ones.
datamap = OrderedDict(
    (site, "{}/{}/Posts.xml".format(DATA_DIR, folder))
    for site, folder in SITE_FOLDERS.items()
)



In [3]:
def read_prog_lang(filename):
    """Read a tab-separated file mapping a language name to its tags.

    Each non-blank line is expected to look like ``<language>\\t<tag>,<tag>,...``.
    Only the first two tab-separated columns are used.

    Args:
        filename: Path of the TSV file to read.

    Returns:
        An OrderedDict (file order preserved) whose keys are the lower-cased,
        stripped first column and whose values are lists of the lower-cased,
        stripped comma-separated entries of the second column. A line without
        a second column maps to an empty list. If a key occurs on several
        lines, the last line wins.
    """
    data = OrderedDict()
    # `with` guarantees the handle is closed even if a line fails to parse.
    with open(filename, "r") as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line) carry no mapping; skip
            # them instead of failing with IndexError on the missing column.
            if not line.strip():
                continue
            row = line.split("\t")
            key = row[0].lower().strip()
            elements = row[1].split(",") if len(row) > 1 else []
            data[key] = [element.strip().lower() for element in elements]
    return data

def atleast_one(a, b):
    """Return True when `a` and `b` share at least one element, else False."""
    candidates = set(a)
    # Stops at the first shared element, like a disjointness check would.
    return any(item in candidates for item in b)

def refine_tags(tags):
    """Split a ``<tag1><tag2>`` style string into a list of lower-cased tags.

    Args:
        tags: String with tags wrapped in angle brackets, or None.

    Returns:
        A list with one lower-cased entry per ``><``-separated part, with every
        ``<`` and ``>`` character removed. When `tags` is None the placeholder
        list ``["???"]`` is returned.
    """
    # Identity check: `is None` is the idiomatic None test and cannot be
    # affected by a custom __eq__/__ne__ on the argument.
    if tags is None:
        return ["???"]
    return [part.replace('>', '').replace('<', '').lower() for part in tags.split('><')]
    
def read_tags(filename):
    """Read the first column of a comma-separated file, skipping its first line.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with the stripped, lower-cased first field of every line after
        the first one, in file order. An empty file yields an empty list.
    """
    logtags = []
    # `with` guarantees the handle is closed once reading is done.
    with open(filename, "r") as fn:
        # Skip the first (header) line; the default avoids StopIteration when
        # the file is empty.
        next(fn, None)
        for row in fn:
            logtags.append(row.split(",")[0].strip().lower())
    return logtags

def intersect(a, b):
    """Return the distinct elements present in both `a` and `b` as a list.

    The order of the returned list is unspecified.
    """
    left, right = set(a), set(b)
    return list(left.intersection(right))

In [4]:
# Input/output locations used by this notebook.
tagfile = "tags/Logging-topic-modelling - Sheet3.csv"
output_location = "output/"

# Language name -> list of tags, in file order.
prog_tags = read_prog_lang("tags/logging-programming - Sheet1.tsv")

# De-duplicated union of the general tags and every language-specific tag.
base_tags = list(set(read_tags(tagfile)))
language_tags = [tag for tags in prog_tags.values() for tag in tags]
general_tags = list(set(base_tags + language_tags))

# Language names, in the order they appear in the TSV file.
langs = list(prog_tags)

In [5]:
# Drop 'android' from the languages to process. Filtering (instead of
# list.remove) keeps this cell idempotent: running it a second time no longer
# raises ValueError because the entry is already gone.
langs = [name for name in langs if name != 'android']
langs

['java', 'c', 'c++', 'python', 'c#', 'javascript']

In [6]:
# Language name -> folder (relative to the notebook) holding its documents.
path_lookup = {lang: "programming_language/{}".format(lang) for lang in langs}

In [7]:
def make_word_set(path):
    """Tokenise every file in a folder and collect the folder's vocabulary.

    Each file is read, lower-cased and split with `word_tokenize`; tokens that
    are in the module-level `stop_words` set or consist only of digits are
    dropped.

    Args:
        path: Folder whose entries (as returned by `os.listdir`) are read.

    Returns:
        A 4-tuple ``(tokenized_document, tokenized_document_set, word_set, n)``:
        * tokenized_document: file name -> list of kept tokens (with repeats),
        * tokenized_document_set: file name -> set of kept tokens,
        * word_set: set of all kept tokens across the folder,
        * n: number of entries listed in `path`.
    """
    word_set = set()
    tokenized_document = {}
    tokenized_document_set = {}
    documents = os.listdir(path)
    for document_name in documents:
        # `with` closes the file even when reading or tokenising raises.
        # NOTE(review): the platform default encoding is used, as before --
        # confirm it matches how the documents were written.
        with open(os.path.join(path, document_name), "r") as f:
            document = f.read().lower()
        tokens = [i for i in word_tokenize(document) if i not in stop_words and not i.isdigit()]
        tokenized_document[document_name] = tokens
        tokenized_document_set[document_name] = set(tokens)
        # `tokens` is already filtered, so it can be merged into the
        # vocabulary directly without re-checking every word.
        word_set.update(tokens)
    return tokenized_document, tokenized_document_set, word_set, len(documents)

def check_times_exists_in_documents(tokenized_document_set, word):
    """Return the number of documents whose token collection contains `word`.

    Args:
        tokenized_document_set: Mapping of document name -> collection of tokens.
        word: Term to look for.

    Returns:
        The document frequency of `word` as an int (0 when no document has it).
    """
    return sum(1 for tokens in tokenized_document_set.values() if word in tokens)

def write_to_csv(filename, header, rows):
    """Write a header line followed by data rows to a CSV file.

    An existing file at `filename` is overwritten.

    Args:
        filename: Path of the CSV file to create.
        header: Sequence of column names, written as the first row.
        rows: Iterable of row sequences, written in order after the header.
    """
    # `with` closes the file even if a row cannot be written; newline='' lets
    # the csv module control line endings itself.
    with open(filename, "w", newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows)

In [8]:
# Compute TF-IDF per language and write one CSV per language.
#   TF     = occurrences of the term in the document / kept tokens in the document
#   DF     = number of documents of that language containing the term
#   TF-IDF = TF * ln(N / DF), with N the number of documents of that language

# Folder receiving the per-language CSV files; created up front so a fresh
# checkout does not fail when the first file is opened for writing.
TF_IDF_DIR = "tf_idf"
os.makedirs(TF_IDF_DIR, exist_ok=True)

result_header = ['Document name', 'Term','Term frequency (TF)', 'Document frequency (DF)','Number of documents (N)',
                 'Number of times term occurred in the document', 'Number of words in the document','TF-IDF-Score']

for lang in langs:
    path = path_lookup[lang]
    tokenized_document, tokenized_document_set, word_set, N = make_word_set(path)

    # Document frequency in a single pass: every document adds 1 for each of
    # its distinct terms (instead of scanning all documents once per word).
    DF = Counter()
    for document_set in tokenized_document_set.values():
        DF.update(document_set)
    print("DF created for {}".format(lang))

    results = []
    for document_name, document in tokenized_document.items():
        number_of_words = len(document)
        # Count all terms of the document once instead of calling
        # list.count() for every term (quadratic in the document length).
        term_counts = Counter(document)
        # Sorted terms give a reproducible row order inside each document.
        for word in sorted(term_counts):
            number_of_times = term_counts[word]
            tf = number_of_times / number_of_words
            df = DF[word]
            tf_idf = tf * math.log(N / df)
            results.append([document_name, word, tf, df, N, number_of_times, number_of_words, tf_idf])
    print("results created for {}".format(lang))

    filename = "{}/{}.csv".format(TF_IDF_DIR, lang)
    write_to_csv(filename, result_header, results)

DF created for java
results created for java
DF created for c
results created for c
DF created for c++
results created for c++
DF created for python
results created for python
DF created for c#
results created for c#
DF created for javascript
results created for javascript
