# Getting "context chunks" and topic modeling those chunks

This notebook is to find the character start and end indices for a given quotation, then generate a csv file with all items quoting within that range.

## Import

In [1]:
from text_matcher.matcher import Text, Matcher
import json
#import nltk
#nltk.download('stopwords')
import pandas as pd
from IPython.display import clear_output
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = [16, 6]
#pd.set_option('display.max_colwidth', None)

## Define filepath

In [2]:
# ACTION: copy path to results JSONL file here (filename should end "_results_[hyperparameters].jsonl")

startData = "/Users/milan/Library/CloudStorage/GoogleDrive-mtt2126@columbia.edu/My Drive/iAnnotate/MIT/Quotable Content/Data/Watt/1957_RiseNovel/Results/Watt_1957_RiseNovel_results_t2-c3-n2-m3-nostops.jsonl"

In [3]:
# Derive the naming variables from the results path. The filename is expected to look like
# "<Surname>_<Year>_<Title>_results_<hyperparameters>.jsonl"

# Split once on the last four underscores and reuse the pieces for every field
nameFields = startData.rsplit("_", 4)

textTitle = nameFields[-3]
publicationYear = nameFields[-4]
# The first piece still carries the directory part of the path; keep only what follows the last "/"
authorSurname = nameFields[-5].rsplit("/", 1)[-1]
# [:-6] removes the trailing ".jsonl" from the last piece
hyperparSuffix = f"_{nameFields[-1][:-6]}"
# Dropping the last four path components leaves the data directory
dataDir = startData.rsplit("/", 4)[0]

print(f"Author surname: {authorSurname}\nPublication year: {publicationYear}\nText title: {textTitle}\nHyperparameters suffix: {hyperparSuffix}\nData directory:{dataDir}")

# Project name and the standard sub-directories for this text
projectName = f"{authorSurname}_{publicationYear}_{textTitle}"
sourceDir = f"{dataDir}/{authorSurname}/{publicationYear}_{textTitle}/Source"
corpusDir = f"{dataDir}/{authorSurname}/{publicationYear}_{textTitle}/Corpus"
resultsDir = f"{dataDir}/{authorSurname}/{publicationYear}_{textTitle}/Results"

Author surname: Watt
Publication year: 1957
Text title: RiseNovel
Hyperparameters suffix: _t2-c3-n2-m3-nostops
Data directory:/Users/milan/Library/CloudStorage/GoogleDrive-mtt2126@columbia.edu/My Drive/iAnnotate/MIT/Quotable Content/Data


In [4]:
# Load the plain-text source file.
# The encoding is passed explicitly so the read does not depend on the platform's
# default encoding (assumes the source file is UTF-8, like the other text files
# this notebook reads — TODO confirm for new source texts)
inputText = f"{sourceDir}/{projectName}_plaintext.txt"
with open(inputText, encoding="utf-8") as f:
    txt = f.read()

# Wrap the raw text in a text_matcher Text object, labelled with the text title
processedText = Text(txt, f'{textTitle}')

In [25]:
# Load the TextMatcher results file at `startData` as a pandas DataFrame.
# The file is JSON Lines (one JSON record per line), hence lines=True.
# (Note: a later cell also reads 'default.json', which must be in the same directory as this notebook)

df = pd.read_json(startData, lines=True)
df

Unnamed: 0,datePublished,docSubType,Year,Decade,docType,doi,id,identifier,isPartOf,issueNumber,...,url,volumeNumber,wordCount,numMatches,Locations in A,Locations in B,creator,abstract,placeOfPublication,subTitle
0,2001-07-01,misc,2001,2000,article,10.2307/1346068,http://www.jstor.org/stable/1346068,"[{'name': 'doi', 'value': '10.2307/1346068'}, ...",NOVEL: A Forum on Fiction,3,...,http://www.jstor.org/stable/1346068,34,3065,1,"[[592927, 592954]]","[[10284, 10318]]",,,,
1,1969-06-01,misc,1969,1960,article,10.2307/2932363,http://www.jstor.org/stable/2932363,"[{'name': 'doi', 'value': '10.2307/2932363'}, ...",Nineteenth-Century Fiction,1,...,http://www.jstor.org/stable/2932363,24,516,0,[],[],,,,
2,1964-12-01,research-article,1964,1960,article,10.2307/3825650,http://www.jstor.org/stable/3825650,"[{'name': 'doi', 'value': '10.2307/3825650'}, ...",Victorian Studies,2,...,http://www.jstor.org/stable/3825650,8,15402,1,"[[54877, 54953]]","[[25944, 26020]]",[George Levine],,,
3,1957-06-01,misc,1957,1950,article,10.2307/2088509,http://www.jstor.org/stable/2088509,"[{'name': 'doi', 'value': '10.2307/2088509'}, ...",American Sociological Review,3,...,http://www.jstor.org/stable/2088509,22,3456,0,[],[],,,,
4,2012-09-01,research-article,2012,2010,article,10.2979/reseafrilite.43.3.1,http://www.jstor.org/stable/10.2979/reseafrili...,"[{'name': 'doi', 'value': '10.2979/reseafrilit...",Research in African Literatures,3,...,http://www.jstor.org/stable/10.2979/reseafrili...,43,10709,2,"[[13006, 13044], [60749, 60916]]","[[22804, 22844], [22962, 23119]]",[Akin Adesokan],ABSTRACT Postcolonial novels that tend to beco...,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6817,1997-09-01,misc,1997,1990,article,10.2307/899971,http://www.jstor.org/stable/899971,"[{'name': 'doi', 'value': '10.2307/899971'}, {...",Notes,1,...,http://www.jstor.org/stable/899971,54,51912,1,"[[221, 250]]","[[52471, 52500]]","[Paul Cauthen, Mark Palkovic]",,,
6818,2003-04-01,book-review,2003,2000,article,10.2307/23540422,http://www.jstor.org/stable/23540422,"[{'name': 'doi', 'value': '10.2307/23540422'},...",Biography,2,...,http://www.jstor.org/stable/23540422,26,19697,0,[],[],,,,
6819,2017-10-01,research-article,2017,2010,article,10.2307/26793237,http://www.jstor.org/stable/26793237,"[{'name': 'doi', 'value': '10.2307/26793237'},...",The English Historical Review,558,...,http://www.jstor.org/stable/26793237,132,48757,1,"[[331741, 331794]]","[[16074, 16127]]",,,,
6820,2017-04-01,research-article,2017,2010,article,10.2307/90001119,http://www.jstor.org/stable/90001119,"[{'name': 'doi', 'value': '10.2307/90001119'},...",The Eighteenth Century,1,...,http://www.jstor.org/stable/90001119,58,8914,1,"[[51039, 51075]]","[[5612, 5648]]",[Thomas Salem Manganaro],ABSTRACT This essay examines the case of actin...,,


In [26]:
# Drop items with no matches from dataframe

df = df[df['numMatches']>=1]
df

Unnamed: 0,datePublished,docSubType,Year,Decade,docType,doi,id,identifier,isPartOf,issueNumber,...,url,volumeNumber,wordCount,numMatches,Locations in A,Locations in B,creator,abstract,placeOfPublication,subTitle
0,2001-07-01,misc,2001,2000,article,10.2307/1346068,http://www.jstor.org/stable/1346068,"[{'name': 'doi', 'value': '10.2307/1346068'}, ...",NOVEL: A Forum on Fiction,3,...,http://www.jstor.org/stable/1346068,34,3065,1,"[[592927, 592954]]","[[10284, 10318]]",,,,
2,1964-12-01,research-article,1964,1960,article,10.2307/3825650,http://www.jstor.org/stable/3825650,"[{'name': 'doi', 'value': '10.2307/3825650'}, ...",Victorian Studies,2,...,http://www.jstor.org/stable/3825650,8,15402,1,"[[54877, 54953]]","[[25944, 26020]]",[George Levine],,,
4,2012-09-01,research-article,2012,2010,article,10.2979/reseafrilite.43.3.1,http://www.jstor.org/stable/10.2979/reseafrili...,"[{'name': 'doi', 'value': '10.2979/reseafrilit...",Research in African Literatures,3,...,http://www.jstor.org/stable/10.2979/reseafrili...,43,10709,2,"[[13006, 13044], [60749, 60916]]","[[22804, 22844], [22962, 23119]]",[Akin Adesokan],ABSTRACT Postcolonial novels that tend to beco...,,
5,1997-07-01,research-article,1997,1990,article,10.2307/23041237,http://www.jstor.org/stable/23041237,"[{'name': 'doi', 'value': '10.2307/23041237'},...",Hermathena,162/163,...,http://www.jstor.org/stable/23041237,,111927,1,"[[346867, 346904]]","[[286873, 286916]]",[A. H. F. Griffin],,,
6,2023-01-01,book,2023,2020,book,10.3998/mpub.11738099,http://www.jstor.org/stable/10.3998/mpub.11738099,"[{'name': 'doi', 'value': '10.3998/mpub.117380...",,,...,http://www.jstor.org/stable/10.3998/mpub.11738099,,96697,1,"[[5319, 5344]]","[[215144, 215166]]",[Joseph B. Entin],,Ann Arbor,"Fiction, Film, and Precarious Work"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6815,2023-01-01,book,2023,2020,book,10.2307/jj.5076351,http://www.jstor.org/stable/jj.5076351,"[{'name': 'doi', 'value': '10.2307/jj.5076351'...",,,...,http://www.jstor.org/stable/jj.5076351,,58467,1,"[[331741, 331785]]","[[25385, 25429]]",[Shane Ewen],,LONDON,"Fire, Safety and Deregulation in Twentieth-Cen..."
6817,1997-09-01,misc,1997,1990,article,10.2307/899971,http://www.jstor.org/stable/899971,"[{'name': 'doi', 'value': '10.2307/899971'}, {...",Notes,1,...,http://www.jstor.org/stable/899971,54,51912,1,"[[221, 250]]","[[52471, 52500]]","[Paul Cauthen, Mark Palkovic]",,,
6819,2017-10-01,research-article,2017,2010,article,10.2307/26793237,http://www.jstor.org/stable/26793237,"[{'name': 'doi', 'value': '10.2307/26793237'},...",The English Historical Review,558,...,http://www.jstor.org/stable/26793237,132,48757,1,"[[331741, 331794]]","[[16074, 16127]]",,,,
6820,2017-04-01,research-article,2017,2010,article,10.2307/90001119,http://www.jstor.org/stable/90001119,"[{'name': 'doi', 'value': '10.2307/90001119'},...",The Eighteenth Century,1,...,http://www.jstor.org/stable/90001119,58,8914,1,"[[51039, 51075]]","[[5612, 5648]]",[Thomas Salem Manganaro],ABSTRACT This essay examines the case of actin...,,


## Let's find the start and end characters for a given quotation

In [27]:
processedText.text



In [28]:
# ACTION: replace string between quotation marks with desired quotation
quotation = 'novel is a\nfull and authentic report of human experience'

# ACTION: give this passage a shorthand name
passageShorthand = "novelFull"


In [29]:
index = processedText.text.index(quotation)

passageStart = index
passageEnd = index + len(quotation)
quoteIndex = [passageStart, passageEnd]

print('Starting index:', passageStart) 
print('Ending index:', passageEnd)
print(f'Character indexes for match: {quoteIndex}')
print("\n Corresponding text:")
processedText.text[index:index + len(quotation)]

Starting index: 61139
Ending index: 61195
Character indexes for match: [61139, 61195]

 Corresponding text:


'novel is a\nfull and authentic report of human experience'

In [30]:
# Keep only the item id, the year, and the character-index columns for the quotations
df = df[['id', 'Year', 'Locations in A', 'Locations in B']]
df

Unnamed: 0,id,Year,Locations in A,Locations in B
0,http://www.jstor.org/stable/1346068,2001,"[[592927, 592954]]","[[10284, 10318]]"
2,http://www.jstor.org/stable/3825650,1964,"[[54877, 54953]]","[[25944, 26020]]"
4,http://www.jstor.org/stable/10.2979/reseafrili...,2012,"[[13006, 13044], [60749, 60916]]","[[22804, 22844], [22962, 23119]]"
5,http://www.jstor.org/stable/23041237,1997,"[[346867, 346904]]","[[286873, 286916]]"
6,http://www.jstor.org/stable/10.3998/mpub.11738099,2023,"[[5319, 5344]]","[[215144, 215166]]"
...,...,...,...,...
6815,http://www.jstor.org/stable/jj.5076351,2023,"[[331741, 331785]]","[[25385, 25429]]"
6817,http://www.jstor.org/stable/899971,1997,"[[221, 250]]","[[52471, 52500]]"
6819,http://www.jstor.org/stable/26793237,2017,"[[331741, 331794]]","[[16074, 16127]]"
6820,http://www.jstor.org/stable/90001119,2017,"[[51039, 51075]]","[[5612, 5648]]"


In [31]:
# Explode the paired 'Locations in A' / 'Locations in B' lists so that each individual
# match gets its own row; the other columns (id, Year) are repeated for every match
df = df.explode(['Locations in A', 'Locations in B'])


### Let's get all the ids, Locations in B and Locations in A for just our specified character index range (e.g., squirrel's heartbeat)

In [32]:
def overlaps(range1, range2):
    """Tell whether two intervals, each given as a two-item sequence, overlap.

    Each interval has the form [start, end], e.g. [x0, x1] and [y0, y1]. The
    intervals overlap when the larger of the two starts is strictly less than
    the smaller of the two ends, so intervals that merely touch do not count.

    Returns False (rather than raising) when either argument does not have
    exactly two items, or when a TypeError occurs — for example when an
    argument has no length or its items cannot be compared.
    """
    try:
        # Guard clause: anything that is not a pair cannot be compared as an interval
        if len(range1) != 2 or len(range2) != 2:
            return False
        latest_start = max(range1[0], range2[0])
        earliest_end = min(range1[1], range2[1])
        return latest_start < earliest_end
    except TypeError:
        return False

In [33]:
# Collect every match whose span in text A overlaps the chosen quotation (quoteIndex),
# keeping its locations in A and B together with the id and year of the quoting item
locs_in_a = df['Locations in A']
locs_in_b = df['Locations in B']

# Row positions (not index labels) of the overlapping matches
overlapping_rows = [row for row in range(len(df)) if overlaps(quoteIndex, locs_in_a.iloc[row])]

quotation_locations_in_A = [locs_in_a.iloc[row] for row in overlapping_rows]
quotation_locations_in_B = [locs_in_b.iloc[row] for row in overlapping_rows]
ids = [df['id'].iloc[row] for row in overlapping_rows]
years = [df['Year'].iloc[row] for row in overlapping_rows]

In [34]:
zipped = list(zip(ids, years, quotation_locations_in_A, quotation_locations_in_B))
just_matches_df = pd.DataFrame(zipped, columns=['id', 'Year', 'Locations in A', 'Locations in B'])
just_matches_df

Unnamed: 0,id,Year,Locations in A,Locations in B
0,http://www.jstor.org/stable/10.2979/pft.2005.2...,2005,"[61098, 61508]","[30901, 31308]"
1,http://www.jstor.org/stable/521614,1997,"[61139, 61195]","[25285, 25341]"
2,http://www.jstor.org/stable/41939307,2013,"[61150, 61195]","[8229, 8274]"
3,http://www.jstor.org/stable/1342865,1976,"[61098, 61508]","[11761, 12171]"
4,http://www.jstor.org/stable/41337580,2012,"[60957, 61508]","[66200, 66744]"
5,http://www.jstor.org/stable/40267751,2009,"[61150, 61195]","[10861, 10906]"
6,http://www.jstor.org/stable/j.ctvct017z.11,2015,"[61139, 61389]","[18620, 18872]"
7,http://www.jstor.org/stable/1772086,1986,"[61098, 61195]","[15805, 15902]"
8,http://www.jstor.org/stable/10.1525/rep.2012.1...,2012,"[61098, 61508]","[28556, 28966]"
9,http://www.jstor.org/stable/24353150,2012,"[61150, 61195]","[21676, 21721]"


In [35]:
just_matches_df.to_csv(f"{resultsDir}/{textTitle}_{passageShorthand}.csv")

# Alternate for VS


In [13]:
# NOTE(review): `df_just_quotes` is not defined anywhere in this notebook, so this cell
# raises NameError on a fresh kernel (see the recorded error below) — TODO confirm which
# DataFrame was intended.
# Unlike the loop above, this one applies no overlap filter: it copies the locations and
# id of every row. It also overwrites the lists built by that earlier loop.
quotation_locations_in_A = []
quotation_locations_in_B = []
ids = []

for item in range(len(df_just_quotes)):
    loc_in_a_index = df_just_quotes['Locations in A'].iloc[item]
    quotation_locations_in_A.append(df_just_quotes['Locations in A'].iloc[item])
    quotation_locations_in_B.append(df_just_quotes['Locations in B'].iloc[item])
    ids.append(df_just_quotes['id'].iloc[item])

NameError: name 'df_just_quotes' is not defined

### Let's use the start and end characters to extract a context chunk left & right of our quotations

Set the `num_characters_before_quote` and `num_characters_after_quote`

In [None]:
# Load the item records from 'default.json' (read from the notebook's working directory),
# then derive the publication year and the decade from 'datePublished'.
# Later cells look articles up in this DataFrame by 'id' and read their 'fullText'.
df_with_fulltexts = pd.read_json('default.json')
df_with_fulltexts['year'] = pd.DatetimeIndex(df_with_fulltexts['datePublished']).year
df_with_fulltexts['Decade'] = df_with_fulltexts['year'] - (df_with_fulltexts['year'] % 10)

In [None]:
# Change these to desired chunk
num_characters_before_quote = 200
num_characters_after_quote = 750
decades_to_check = [2010, 2000, 1990, 1980, 1970, 1960, 1950]
list_of_VS_journals = ['Victorian Studies', 'George Eliot - George Henry Lewes Studies', 'Nineteenth-Century Fiction', 'Nineteenth-Century Literature', 'Dickens Studies Annual', 'Victorian Literature and Culture', 'Victorian Review', 'The George Eliot, George Henry Lewes Newsletter', 'Victorian Periodicals Review', 'Dickens Quarterly', 'Victorian Poetry', 'The Thomas Hardy Journal', 'The Gaskell Society Journal', 'The Gaskell Journal', 'Newsletter of the Victorian Studies Association of Western Canada', 'Dickens Studies Newsletter', 'Browning Institute Studies', 'Victorian Periodicals Newsletter', 'Carlyle Studies Annual', 'Conradiana', 'Tennyson Research Bulletin', 'The Conradian', 'The Hardy Society Journal', 'The Hardy Review', 'Studies in Browning and His Circle', 'Nineteenth-Century French Studies', 'The Wilkie Collins Journal', 'Carlyle Newsletter', 'The Wildean', 'Dickens Studies', 'Carlyle Annual', '19th-Century Music', 'The Trollopian', 'Conrad Studies']


def extract_context_chunk(text, start, end, chars_before, chars_after):
    """Return the context around text[start:end], with the quotation itself replaced by "[...]".

    Parameters
    ----------
    text : str
        The full text that contains the quotation.
    start, end : int
        Character indexes where the quotation starts and ends in `text`.
    chars_before, chars_after : int
        How many characters of context to keep before `start` and after `end`.

    Returns
    -------
    str
        Up to `chars_before` characters preceding the quotation, then "[...]",
        then up to `chars_after` characters following it.
    """
    # Clamp at 0: a negative slice start would count from the END of the text, which
    # gives a wrong (usually empty) left context for quotations near the beginning
    context_start = max(0, start - chars_before)
    return text[context_start:start] + "[...]" + text[end:end + chars_after]


# Create an empty list that we will populate with the contexts for quotations
context_chunks_for_quotations = []
context_chunks_ids = []
context_chunks_decade = []


# Loop over each of the start and end locations to produce a x-character chunk of context
for item in range(len(just_matches_df)):
    article_URL = just_matches_df['id'].loc[item]
    startandEndLocations = just_matches_df['Locations in B'].loc[item]
    quote_start, quote_end = startandEndLocations[0], startandEndLocations[1]
    print(article_URL)
    print(quote_start)
    print(quote_end)
    # Find the row of the quoting article by its id; raises IndexError if the id is missing
    article_index = df_with_fulltexts[df_with_fulltexts['id'] == article_URL].index[0]
    print(article_index)
    article_title = df_with_fulltexts['title'].loc[article_index]
    print(article_title)
    article_decade = df_with_fulltexts['Decade'].loc[article_index]
    print(article_decade)
    article_journal = df_with_fulltexts['isPartOf'].loc[article_index]
    if article_journal in list_of_VS_journals:
    #if article_decade in decades_to_check:
        # Only build the Text object for articles that pass the filter
        article_text = df_with_fulltexts['fullText'].loc[article_index]
        cleaned_article_text = Text(article_text, article_title)
        context_chunks_for_quotations.append(
            extract_context_chunk(
                cleaned_article_text.text,
                quote_start,
                quote_end,
                num_characters_before_quote,
                num_characters_after_quote,
            )
        )
        context_chunks_ids.append(df_with_fulltexts['id'].loc[article_index])
        context_chunks_decade.append(article_decade)

### Print the contexts around quotations:

In [None]:
context_chunks_for_quotations

In [None]:
context_chunks_ids

### Print the context around the fourth quotation (index 3)

In [None]:
print(context_chunks_for_quotations[3])

## To write output to a single text file

In [None]:
# Write every context chunk, one per line, to a single text file.
from pathlib import Path

# The output name combines the passage shorthand with the context-window sizes.
# `passageShorthand` is the passage name set near the top of the notebook; the name
# previously used here (`name_of_passage`) is never defined and raised NameError.
name_of_quote = f"{passageShorthand}-{num_characters_before_quote}-char-before-and-{num_characters_after_quote}-after"

contexts_path = Path(f'VS-and-Victorian-studies/quotation-contexts-{name_of_quote}.txt')
# Create the output directory if needed so that open() does not fail on a fresh checkout
contexts_path.parent.mkdir(parents=True, exist_ok=True)

# The with-block closes the file even if a write fails
with open(contexts_path, mode='w', encoding='utf-8') as output_file:
    for context in context_chunks_for_quotations:
        output_file.write(context)
        output_file.write('\n')

In [None]:
#name_of_quote = f"{name_of_passage}-fulltexts"

## To write both contexts and IDs to a CSV


In [None]:
zipped = list(zip(context_chunks_ids, context_chunks_decade, context_chunks_for_quotations))
chunks_with_ids_df = pd.DataFrame(zipped, columns=['id', 'Decade', 'Quotation Context Chunk'])
chunks_with_ids_df.to_csv(f'quotation-contexts-ids-{name_of_quote}.csv', encoding ='utf-8')

## To write output to multiple text files in a new directory

In [None]:
# To output our context for quotations as a series of new files with the same beginning, followed by the number of the section

#Import pathlib 
from pathlib import Path

# Define and name the new output directory using pathlib
path = Path(f'{name_of_quote}_contexts/')
path.mkdir(exist_ok=True)

# Set the prefix for our output files, followed by the number of the section
begining_of_output_filenames = f'{name_of_quote}_contexts-'

# Write each context chunk to its own file, numbered from 1.
# write_text opens, writes and closes the file in one call; the files are written as
# UTF-8 because the topic-modeling cells later read them back with encoding='utf-8'
for i, context in enumerate(context_chunks_for_quotations, start=1):
    (path / f"{begining_of_output_filenames}{i}.txt").write_text(context, encoding='utf-8')

## To look at most frequent words

In [None]:
import re
from collections import Counter

# Define Functions

def split_into_words(any_chunk_of_text):
    """Lowercase a string and return the words it contains, in order.

    A word is a maximal run of word characters (letters, digits, underscore);
    everything else acts as a separator, so "don't" yields "don" and "t".

    Parameters
    ----------
    any_chunk_of_text : str
        The text to split.

    Returns
    -------
    list of str
        The lowercased words. Empty strings are never included, so text that
        starts or ends with punctuation or whitespace does not add a spurious
        '' token to later word counts.
    """
    lowercase_text = any_chunk_of_text.lower()
    # findall on r"\w+" gives the same words as splitting on \W+, minus the empty
    # strings that re.split returns at the edges; the raw string also avoids the
    # invalid "\W" escape warning
    return re.findall(r"\w+", lowercase_text)

# Define Filepaths and Assign Variables

#directory_path = f'{name_of_quote}_contexts/'
filename = f'VS-and-Victorian-studies/quotation-contexts-{name_of_quote}.txt'
number_of_desired_words = 100

stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers',
 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here',
 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so',
 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 've', 'll', 'amp',
]

most_frequent_non_stopwords = []


# Read the combined contexts file; the with-block closes the handle once the text is read
with open(filename, encoding="utf-8", errors='ignore') as contexts_file:
    full_text = contexts_file.read()
all_the_words = split_into_words(full_text)
# A set makes each stopword lookup constant-time instead of a scan of the list
stopword_set = set(stopwords)
non_stopwords = [word for word in all_the_words if word not in stopword_set]
non_stopwords_tally = Counter(non_stopwords)
# most_common returns (word, count) pairs, most frequent first
most_frequent_non_stopwords_to_add = non_stopwords_tally.most_common(number_of_desired_words)
# Stored as a one-item list holding the list of pairs; the cell that writes the words out iterates it that way
most_frequent_non_stopwords.append(most_frequent_non_stopwords_to_add)

# Output Results

most_frequent_non_stopwords


In [None]:
# Print the most frequent words and write them to a text file, one word per line.
# The with-block closes the file even if a write fails part-way through
with open(f'VS-and-Victorian-studies/quotation-contexts-most-frequent-words-{name_of_quote}.txt', mode='w', encoding='utf-8') as output_file:
    for item in most_frequent_non_stopwords:
        for word in item:
            # Each entry is a (word, count) pair; only the word is written out
            print(word[0])
            output_file.write(word[0])
            output_file.write('\n')

## To Topic Model our  contents

In [None]:
#!pip install little_mallet_wrapper

In [None]:
import little_mallet_wrapper
import seaborn
import glob
from pathlib import Path

## NOTE: we have to install openjdk + mallet
To install the open Java Development Kit, follow the steps below.

## Step 1: Download and Install Java Development Kit

Go to the Java Development Kit download page, find your operating system, and click on the corresponding download link: https://www.oracle.com/java/technologies/javase-jdk14-downloads.html

Choose the most recent JDK (currently JDK 16) and click "JDK Download." This will take you to an installation page. Choose the appropriate file to download: 

- For Mac: macOS Installer (ARM is for Macs with M1/M2 chips, x64 is for Macs with Intel processors)
- For Windows: Windows x64 Installer
- For Linux: the Linux x64 package for your distribution (e.g., the Debian or RPM package, or the compressed archive)

Then open or unzip the file and follow all the instructions to complete the installation.

## Step 2: Tell your computer where to find Java

Now we have to tell our computers where to find JDK. For Macs, this involves defining a new variable, called JAVA_HOME and give it the filepath of our new JDK. For Windows, we edit a PATH variable to include the file path of JDK.

### For Mac

Open up your Terminal. Check whether the language at the top is "bash" or "zsh"

![image](../_images/command-line-language.png)

 - If the top of your command prompt shell says "bash"....
	-  You can run the following on the command line to set up the JAVA_HOME environment variable. The line of code adds your JAVA_HOME variable to a file called “bash_profile”, which is where environment variables are stored. `echo "export JAVA_HOME=$(/usr/libexec/java_home)" >> ~/.bash_profile`
	-  Then, to immediately update your "bash profile", run: `source ~/.bash_profile`
	-  Then, to test whether Java correctly installed, type `javac`. If you get a list of options, you correctly installed it. If your command line says "Not recognized," then it is not correctly installed.
-  If the top of your command prompt shell says "zsh"...
	-  You can run the following on the command line to set up the JAVA_HOME environment variable. The line of code adds your JAVA_HOME variable to a file called “zshrc”, which is where environment variables are stored. `echo "export JAVA_HOME=$(/usr/libexec/java_home)" >> ~/.zshrc`
	-  Then, to immediately update your "zshrc", run: `source ~/.zshrc`
	-  Then, to test whether Java correctly installed, type `javac`. If you get a list of options, you correctly installed it. If your command line says "Not recognized," then it is not correctly installed.



##  To  install Mallet
To download MALLET, click the following [link](http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip) or find the link on the [MALLET home page.](https://mimno.github.io/Mallet/) Once the zip file downloads, unzip it.

Once installed, verify that it's working

To test whether MALLET works on your computer, type in the file path for MALLET on the command line and import-file, like so:

- Mac:  `~/mallet-2.0.8/bin/mallet import-file` 
- Windows: `C:\mallet-2.0.8\bin\mallet import-file`

If it’s working, then you’ll get a message that says “A tool for creating instance lists of feature vectors from comma-separated-values” and a list of options.

If you get a permissions error, type `chmod +x mallet-2.0.8/bin/mallet`

In [None]:
path_to_mallet = 'mallet-2.0.8/bin/mallet'

In [None]:
directory = f'{name_of_quote}_contexts/'

In [None]:
files = glob.glob(f"{directory}/*.txt")

In [None]:
files

In [None]:
# Read every context file and pre-process it for topic modeling
training_data = []
for file in files:
    # The with-block closes each file as soon as it has been read
    with open(file, encoding='utf-8') as context_file:
        text = context_file.read()
    processed_text = little_mallet_wrapper.process_string(text, numbers='remove')
    training_data.append(processed_text)

In [None]:
# Keep the unprocessed text of every context file, in the same order as `files`
original_texts = []
for file in files:
    # The with-block closes each file as soon as it has been read
    with open(file, encoding='utf-8') as context_file:
        text = context_file.read()
    original_texts.append(text)

In [None]:
context_titles = [Path(file).stem for file in files]
# The Path().stem function extract the filename without the .txt extension

In [None]:
context_titles

In [None]:
little_mallet_wrapper.print_dataset_stats(training_data)

In [None]:
# NOTE(review): the cells at the end of this notebook call display_top_titles_per_topic
# for topic numbers 0 through 9; with num_topics = 5 the calls for topics 5-9 will
# presumably fail — TODO confirm whether 5 or 10 topics is intended
num_topics = 5 # Change this number to change the number of topics

In [None]:
training_data = training_data

In [None]:
#Change to your desired output directory
output_directory_path = f'topic-model-output/{name_of_quote}'

#No need to change anything below here
Path(f"{output_directory_path}").mkdir(parents=True, exist_ok=True)

path_to_training_data           = output_directory_path + '/training.txt'
path_to_formatted_training_data = output_directory_path + '/mallet.training'
path_to_model                   = output_directory_path + '/mallet.model.' + str(num_topics)
path_to_topic_keys              = output_directory_path + '/mallet.topic_keys.' + str(num_topics)
path_to_topic_distributions     = output_directory_path + '/mallet.topic_distributions.' + str(num_topics)
path_to_word_weights            = output_directory_path + '/mallet.word_weights.' + str(num_topics)
path_to_diagnostics             = output_directory_path + '/mallet.diagnostics.' + str(num_topics) + '.xml'



In [None]:
little_mallet_wrapper.import_data(path_to_mallet,
                path_to_training_data,
                path_to_formatted_training_data,
                training_data)

In [None]:
little_mallet_wrapper.train_topic_model(path_to_mallet,
                      path_to_formatted_training_data,
                      path_to_model,
                      path_to_topic_keys,
                      path_to_topic_distributions,
                      path_to_word_weights,
                      path_to_diagnostics,
                      num_topics)

### Display Topics and Top Words

In [None]:
topics = little_mallet_wrapper.load_topic_keys(path_to_topic_keys)

for topic_number, topic in enumerate(topics):
    print(f"Topic {topic_number}\n\n{topic}\n")

## Load Topic Distributions

In [None]:
topic_distributions = little_mallet_wrapper.load_topic_distributions(path_to_topic_distributions)

In [None]:
#Let's look at number 5
topic_distributions[5]

In [None]:
# Choose one context chunk (by the name of its file, without ".txt") and show its
# contents followed by the probability the model assigns to each topic
context_to_check = f"{name_of_quote}_contexts-6"

# Position of that chunk in context_titles, which is built from the same file list
chunk_number = context_titles.index(context_to_check)

print(f"Topic Distributions for {context_titles[chunk_number]}\n")

print("Chunk contents:\n")
# Read as UTF-8, like the cells that load these files for training; the with-block closes the file
with open(f'{directory}/{context_to_check}.txt', 'r', encoding='utf-8') as chunk_file:
    print(chunk_file.read())
print('\n')
for topic_number, (topic, topic_distribution) in enumerate(zip(topics, topic_distributions[chunk_number])):
    print(f"Topic {topic_number} {topic[:20]} \nProbability: {round(topic_distribution, 3)}\n")

### Top context chunks

In [None]:
training_data_context_titles = dict(zip(training_data, context_titles))
training_data_original_text = dict(zip(training_data, original_texts))

In [None]:
def display_top_titles_per_topic(topic_number=0, number_of_documents=5):
    """Print a topic's entry from `topics`, then the titles of its top documents.

    Reads the notebook-level variables `topics`, `training_data`,
    `topic_distributions` and `training_data_context_titles`.

    Parameters
    ----------
    topic_number : int
        Index of the topic to display.
    number_of_documents : int
        How many top documents to request from little_mallet_wrapper.get_top_docs.

    Returns
    -------
    None
        The results are printed, not returned.
    """
    print(f"Topic {topic_number}\n\n{topics[topic_number]}\n")

    top_documents = little_mallet_wrapper.get_top_docs(
        training_data, topic_distributions, topic_number, n=number_of_documents
    )
    for probability, document in top_documents:
        # Documents come back as their processed text; map each to its context title
        title = training_data_context_titles[document]
        print(round(probability, 4), title + "\n")

In [None]:
display_top_titles_per_topic(topic_number=0, number_of_documents=7)

In [None]:
display_top_titles_per_topic(topic_number=1, number_of_documents=5)

In [None]:
display_top_titles_per_topic(topic_number=2, number_of_documents=6)

In [None]:
display_top_titles_per_topic(topic_number=3, number_of_documents=6)

In [None]:
display_top_titles_per_topic(topic_number=4, number_of_documents=6)

In [None]:
display_top_titles_per_topic(topic_number=5, number_of_documents=6)

In [None]:
display_top_titles_per_topic(topic_number=6, number_of_documents=6)

In [None]:
display_top_titles_per_topic(topic_number=7, number_of_documents=6)

In [None]:
display_top_titles_per_topic(topic_number=8, number_of_documents=6)

In [None]:
display_top_titles_per_topic(topic_number=9, number_of_documents=6)