# Files to Dataframe

In [None]:
import os
import pandas as pd

In [None]:
file_names = os.listdir( r'stories')

In [None]:
fd = {}
for f in file_names:
        with open(r'stories' '\\' + f ,"r",encoding="latin-1") as txtfile:
            fd[f] = txtfile.read().strip()


In [4]:
file_data = (pd.DataFrame.from_dict(fd,orient='index')).reset_index().rename(index=str,columns={'index':'file_name',0:'text'})

In [5]:
file_data.head()

Unnamed: 0,file_name,text
0,100west.txt,THIS IS A SHAREWARE TRIAL PROJECT\n ...
1,13chil.txt,FOR CHILDREN:\n\n ...
2,14.lws,----------------------------------------------...
3,16.lws,----------------------------------------------...
4,17.lws,--------------------------------------------\n...


# Data Cleaning 

In [6]:
temp = file_data['text']

In [7]:
import re
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [8]:
def clean(list):
    newlist =[]
    for text in list:
        text = text.lower()
        text = re.sub(r"[;:\"~`!@#$%^&*(){}<>+=_-]","",text)
        text = re.sub(r'http[s]://(?:[a-zA-Z]|[0-9]|[!@$&+.*\(\)]|(?:%[0-9a-fA-F][0-9a-fA-f]))+',"",text)
        text = re.sub(r"[\n]","",text)
        text = re.sub(r"[\t]","",text)
        text = re.sub(r"[.,]"," ",text)
        tokens = word_tokenize(text)
        tokens = [token for token in tokens if token.isalpha()]
        stop_words = set(stopwords.words("english"))
        PS = PorterStemmer()
        tokens = [PS.stem(token) for token in tokens if not token in stop_words]
        tokens = " ".join(tokens)
        newlist.append(tokens)
    return newlist
        

In [9]:
tt = clean(temp) 

In [10]:
file_data = file_data.assign(clean_text = tt)

In [11]:
file_data.head()

Unnamed: 0,file_name,text,clean_text
0,100west.txt,THIS IS A SHAREWARE TRIAL PROJECT\n ...,sharewar trial project freewar need support co...
1,13chil.txt,FOR CHILDREN:\n\n ...,children sli fox mr rabbit sat front porch roc...
2,14.lws,----------------------------------------------...,adventur lone wolf scientifican electron syndi...
3,16.lws,----------------------------------------------...,adventur lone wolf scientifican electron syndi...
4,17.lws,--------------------------------------------\n...,adventur lone wolf scientificth adventur lone ...


# Data Processing

In [12]:
vocab = set()
for doc in tt:
    vocab.update(doc.split())

In [13]:
len(vocab)

92060

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(vocabulary = vocab)
X = tfidf.fit_transform(tt)

In [15]:
import numpy as np

In [16]:
np.shape(X)

(453, 92060)

In [17]:
def cosine_sim(a,b):
    cs = np.dot(a,b)/(np.linalg.norm(a)*np.linalg.norm(b))
    return cs

In [18]:
def cosine_similarity(k,query):
    query = [query]
    p = tfidf.fit_transform(query).toarray()
    cosines=[]
    q = X.A
    for r in range(len(X.A)):        
        c = cosine_sim(p,X.A[r])
        cosines.append(c)
        
    return cosines

# Implementaion and Testing

In [19]:
o = cosine_similarity(10,'overton was evidently considerably excited when he sent it and somewhat incoherent in consequence')

In [20]:
fc = file_data.copy()

In [21]:
fc = fc.assign(cs = o)

In [22]:
fc.sort_values(by=['cs'],ascending=False).head(10)

Unnamed: 0,file_name,text,clean_text,cs
285,missing.txt,:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:.:....,earth dreamland bb text file junki rpgnet gm f...,[0.096540190456066]
229,home.fil,The Following story is by Francis W. Porretto\...,follow stori franci w porrettohors feather bbs...,[0.025534321287683476]
17,7oldsamr.txt,"THE SEVEN OLD SAMURAI\n\n Once upon a time, ...",seven old samurai upon time far japan band fie...,[0.024260216144445797]
149,empsjowk.txt,THE WMPRESS JOWKA\n\n Once upon a time... an...,wmpress jowka upon time empress live japan you...,[0.022880067408350382]
204,gloves.txt,A MISMATCHED PAIR OF GLOVES\n\n\n A young ...,mismatch pair glove young man wish purchas pre...,[0.021406745907884286]
51,bern,THE ADVENTURES OF BERT AND BERNECE\n by Franc...,adventur bert bernec franci u kaltenbaugh midt...,[0.016282922367626365]
144,dwar,DWARF\n by Jeroen van Drie \n\n I take walki...,dwarf jeroen van drie take walk forest much wa...,[0.01607642646958248]
166,fea3,"Survey Results\nCopyright (c) 1994, Joe DeRoue...",survey resultscopyright c joe derouenal right ...,[0.015222437237950852]
377,shrdfarm.txt,A SHREWD FARMER'S STORY\n\n Once upon a time...,shrewd farmer stori upon time live farmer work...,[0.014771046935003493]
392,spectacl.poe,"The Spectacles\n\n Many years ago, it was ...",spectacl mani year ago fashion ridicul idea fi...,[0.014152078878044871]
