# Load Data

In [1]:
from sklearn.datasets import fetch_20newsgroups

In [3]:
categories = [
    "alt.atheism",
    "comp.graphics",
    "comp.sys.ibm.pc.hardware",
    "misc.forsale",
    "rec.autos",
    "sci.space",
    "talk.religion.misc",
]
print("Loading 20 newsgroups training data")
raw_data, _ = fetch_20newsgroups(subset="train", categories=categories, return_X_y=True)
data_size_mb = sum(len(s.encode("utf-8")) for s in raw_data) / 1e6
print(f"{len(raw_data)} documents - {data_size_mb:.3f}MB")

['Subject: Re: Christian Daemons? [Biblical Demons, the u\nFrom: stigaard@mhd.moorhead.msus.edu\nReply-To: stigaard@mhd.moorhead.msus.edu\nOrganization: Moorhead State University, Moorhead, MN\nNntp-Posting-Host: 134.29.97.2\nLines: 23\n\n>>>667\n>>>the neighbor of the beast\n>>\n>>No, 667 is across the street from the beast.  664 and 668 are the\n>>neighbors of the beast.\n>\n>I think some people are still not clear on this:\n>667 is *not* the neighbor of the beast, but, rather, across the\n>street. It is, in fact, 668 which is the neighbor of the beast.\n\nno, sheesh, didn\'t you know 666 is the beast\'s apartment?  667 is across the\nhall from the beast, and is his neighbor along with the rest of the 6th floor.\n\n>Justin (still trying to figure out what this has to do with alt.discordia)\n\nThis doesn\'t seem discordant to you?\n\n-----------------------     ----------------------     -----------------------\n\t-Paul W. Stigaard, Lokean Discordian Libertarian\n  !XOA!\t\tinternet: 

# Define preprocessing functions

In [4]:
import re

In [5]:
def tokenize(doc): 
    """Extract tokens from doc. 

    This uses a simple regex that matches word characters to break strings
    into tokens. For a more principled approach, see CountVectorizer or TfidfVectorizer.
    """

    return [tok.lower() for tok in re.findall(r"\w+", doc)]

list(tokenize("This is a simple example, isn't it?"))

['this', 'is', 'a', 'simple', 'example', 'isn', 't', 'it']

In [6]:
from collections import defaultdict

In [7]:
def token_freqs(doc): 
    """Extract a dict mapping tokens from doc to their occurrences.""" 

    freq = defaultdict(int)
    for tok in tokenize(doc): 
        freq[tok] += 1
    return freq

token_freqs("That is one example, but this is another")

defaultdict(int,
            {'that': 1,
             'is': 2,
             'one': 1,
             'example': 1,
             'but': 1,
             'this': 1,
             'another': 1})

# DictVectorizer

In [8]:
from time import time
from sklearn.feature_extraction import DictVectorizer

In [9]:
# track the performance of the different vectorizers
dict_count_vectorizers = defaultdict(list)

t0 = time() 
vectorizer = DictVectorizer()
vectorizer.fit_transform(token_freqs(d) for d in raw_data)
duration = time() - t0 

dict_count_vectorizers["vectorizer"].append(
    vectorizer.__class__.__name__ + "\non freq dicts"
)
dict_count_vectorizers["speed"].append(data_size_mb / duration)
print(f"done in {duration:.3f}s at {data_size_mb / duration:.1f} MB/s")
print(f"Found {len(vectorizer.get_feature_names_out())} unique terms")

done in 1.053s at 5.9 MB/s
Found 47928 unique terms


In [16]:
dict_count_vectorizers["vectorizer"]

['DictVectorizer\non freq dicts']

In [10]:
type(vectorizer.vocabulary_)

dict

In [11]:
list(vectorizer.vocabulary_.items())[-1]

('appease', 8590)

In [12]:
len(vectorizer.vocabulary_)

47928

# Daniel's Work

### sentimatnet analysis

In [3]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [139]:
# Getting books
books_300 = pd.read_csv('../Daniels_work/sample300books.csv')

# Identify duplicates by book title
duplicates = books_300['title'].duplicated(keep='first')

# Drop duplicate books based on their index
books_300.drop(index=books_300[duplicates].index, inplace=True)

#Dropping null values
books_300 = books_300 = books_300.dropna(subset=['description']).reset_index(drop=True)
books_300

Unnamed: 0,id,title,isbn,page_count,publishing_date,form,publisher,language,author,illustrator,originally_published,genres,subject,awards,nominations,characters,description,sub_title
0,h2Y-PgAACAAJ,Harry Potter and the Chamber of Secrets,"9780439554893, 0439554896",341,1999,Hardcover,Scholastic Press,English,J. K. Rowling,Mary GrandPré,"July 2, 1998","Novel, Fantasy Fiction, Bildungsroman, High fa...",,Nestlé Smarties Book Prize for 9 to 11 years,Guardian Children's Fiction Prize,"Harry Potter, Hermione Granger, Dobby, Profess...",When the Chamber of Secrets is opened again at...,
1,FBXRzgEACAAJ,Harry Potter and the Prisoner of Azkaban,"9780439655484, 043965548X",560,May 2004,Trade paperback,Arthur A. Levine Books,English,J. K. Rowling,Mary GrandPré,"July 8, 1999","Novel, Fantasy Fiction, Adventure fiction, Con...",,"Locus Award for Best Fantasy Novel, Nestlé Sma...","Carnegie Medal for Writing, Hugo Award for Bes...","Harry Potter, Hermione Granger, Sirius Black, ...",The third book in J.K. Rowling's bestselling s...,
2,LH5C9q83T6wC,7,"9780976540601, 0976540606",152,2005,Paperback,Nimble Books LLC,English,W. Frederick Zimmerman,,December 2005,,"Children's stories, English, Fantasy fiction, ...",,,,Through the magic of print-on-demand technolog...,"Unauthorized Harry Potter Book Seven News ; ""H..."
3,joXgAAAAMAAJ,The Ultimate Hitchhiker's Guide,"9780517226957, 0517226952",815,"November 1, 2005",Hardcover,Gramercy Books,English,Douglas Adams,,"January 17, 1996","Science fiction, Humor","Dent, Arthur (Fictitious character) -- Fiction...",,,,"This outrageous volume contains six zany, out-...",Five Complete Novels and One Story
4,4m0Qj9xKksYC,The Ultimate Hitchhiker's Guide to the Galaxy,"9780345453747, 0345453743",815,"April 30, 2002",Trade paperback,Random House Worlds,English,Douglas Adams,,"January 17, 1996","Science fiction, Humor","Fiction / Humorous / General, Fiction / Scienc...",,,,"In one complete volume, here are the five clas...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260,uDINAQAAMAAJ,Aristophanes and Athens,"9780198721581, 0198721587",362,1995,Paperback,Oxford University Press,English,Douglas Maurice MacDowell,,1995,,"Athens (Greece) -- In literature, Athens (Gree...",,,,This book provides an accessible introduction ...,An Introduction to the Plays
261,ocDfwicPJMwC,Lysistrata,"9780872206038, 0872206033",126,2003,Paperback,Hackett Publishing Company,English,Aristophanes,,,,"Feminists -- Humor -- Drama -- Greece, Greece ...",,,"Myrrhine, Calonice, Lampito, Stratyllis, Lysis...",This rollicking new translation of Aristophane...,
262,oudzAAAAIAAJ,"The Knights, Peace, Wealth, The Birds, The Ass...","9780140443325, 0140443320",335,1978,Paperback,Penguin,English,Aristophanes,,,,"Athens (Greece) -- Drama, Athens -- Drama -- G...",,,,Aristophanes is the only surviving representat...,
263,n3MeQikAp00C,Genres in Dialogue,"9780521774338, 0521774330",222,"April 13, 2000",Paperback,Cambridge University Press,English,Andrea Wilson Nightingale,,"December 14, 1995",,"Literary Collections / Ancient & Classical, Ph...",,,,This 1995 book takes as its starting point Pla...,Plato and the Construct of Philosophy


In [5]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Cleaning texts

In [120]:
# Define a function to preprocess text using NLTK
def preprocess_text(text):
    # Tokenize the text into words
    words = word_tokenize(text.lower())
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    
    # Lemmatize the words ~ normalization of words basically
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(w) for w in words]
    
    # Join the words back into a string
    preprocessed_text = ' '.join(words)
    
    return preprocessed_text

In [121]:
# Filter out non-string descriptions
descriptions = books_300['description'].astype(str)

In [122]:
# Preprocess the descriptions using the preprocess_text function
preprocessed_descriptions = descriptions.apply(preprocess_text)
print(preprocessed_descriptions.head(10))


0    chamber secret opened hogwarts school witchcra...
1    third book j.k. rowling 's bestselling series ...
2    magic print-on-demand technology , `` nimble '...
3    outrageous volume contains six zany , out-of-t...
4    one complete volume , five classic novel dougl...
5    one world ’ beloved writer new york time bests...
6    author short history nearly everything body co...
7    one english language ’ skilled beloved writer ...
8    every time bill bryson walk door , memorable t...
9    classic new york time bestselling author walk ...
Name: description, dtype: object


In [107]:
from textblob import TextBlob

# Create a function to get the sentiment polarity of a given text
def get_sentiment(text):
    blob = TextBlob(text)
    return blob.sentiment.polarity

# Apply the sentiment analysis function to the preprocessed descriptions
sentiments = preprocessed_descriptions.apply(get_sentiment)


In [108]:
sentiments

0     -0.275000
1      0.000000
2      0.240487
3      0.240000
4      0.273542
         ...   
260    0.116342
261    0.118733
262    0.081250
263    0.090606
264    0.003030
Name: description, Length: 265, dtype: float64

In [109]:
books_300['sentiment'] = sentiments
books_300.head(11)

Unnamed: 0,id,title,isbn,page_count,publishing_date,form,publisher,language,author,illustrator,originally_published,genres,subject,awards,nominations,characters,description,sub_title,sentiment
0,h2Y-PgAACAAJ,Harry Potter and the Chamber of Secrets,"9780439554893, 0439554896",341,1999,Hardcover,Scholastic Press,English,J. K. Rowling,Mary GrandPré,"July 2, 1998","Novel, Fantasy Fiction, Bildungsroman, High fa...",,Nestlé Smarties Book Prize for 9 to 11 years,Guardian Children's Fiction Prize,"Harry Potter, Hermione Granger, Dobby, Profess...",When the Chamber of Secrets is opened again at...,,-0.275
1,FBXRzgEACAAJ,Harry Potter and the Prisoner of Azkaban,"9780439655484, 043965548X",560,May 2004,Trade paperback,Arthur A. Levine Books,English,J. K. Rowling,Mary GrandPré,"July 8, 1999","Novel, Fantasy Fiction, Adventure fiction, Con...",,"Locus Award for Best Fantasy Novel, Nestlé Sma...","Carnegie Medal for Writing, Hugo Award for Bes...","Harry Potter, Hermione Granger, Sirius Black, ...",The third book in J.K. Rowling's bestselling s...,,0.0
2,LH5C9q83T6wC,7,"9780976540601, 0976540606",152,2005,Paperback,Nimble Books LLC,English,W. Frederick Zimmerman,,December 2005,,"Children's stories, English, Fantasy fiction, ...",,,,Through the magic of print-on-demand technolog...,"Unauthorized Harry Potter Book Seven News ; ""H...",0.240487
3,joXgAAAAMAAJ,The Ultimate Hitchhiker's Guide,"9780517226957, 0517226952",815,"November 1, 2005",Hardcover,Gramercy Books,English,Douglas Adams,,"January 17, 1996","Science fiction, Humor","Dent, Arthur (Fictitious character) -- Fiction...",,,,"This outrageous volume contains six zany, out-...",Five Complete Novels and One Story,0.24
4,4m0Qj9xKksYC,The Ultimate Hitchhiker's Guide to the Galaxy,"9780345453747, 0345453743",815,"April 30, 2002",Trade paperback,Random House Worlds,English,Douglas Adams,,"January 17, 1996","Science fiction, Humor","Fiction / Humorous / General, Fiction / Scienc...",,,,"In one complete volume, here are the five clas...",,0.273542
5,YjAnfhsAQ8wC,A Short History of Nearly Everything,"9780767908184, 076790818X",544,"September 14, 2004",Paperback,Crown,English,Bill Bryson,,"February 4, 2003","Non-fiction, Popular science","History / World, Science / Essays, Science / N...",,Baillie Gifford Prize,,One of the world’s most beloved writers and Ne...,,0.305195
6,yfSOEAAAQBAJ,Bill Bryson's African Diary,"9780767915069, 0767915062",49,"December 3, 2002",Hardcover,Crown,English,Bill Bryson,,"November 26, 2002",,"Biography & Autobiography / Personal Memoirs, ...",,,,From the author of A Short History of Nearly E...,,0.037229
7,nRduHUeIzvAC,Bryson's Dictionary of Troublesome Words,"9780767910439, 0767910435",256,"September 14, 2004",Paperback,Crown,English,Bill Bryson,,1984,,Language Arts & Disciplines / Spelling & Vocab...,,,,One of the English language’s most skilled and...,A Writer's Guide to Getting It Right,0.18443
8,7ZELqUsksIwC,In a sunburned country,"9780767903868, 0767903862",335,"May 15, 2001",Paperback,Crown,English,Bill Bryson,,"June 6, 2000",,"Biography & Autobiography / Personal Memoirs, ...",,,,"Every time Bill Bryson walks out the door, mem...",,0.011905
9,xb4wSmJLnhAC,I'm a Stranger Here Myself,"9780767903820, 076790382X",304,"June 6, 2000",Paperback,Crown,English,Bill Bryson,,"November 5, 1998","Travel literature, Humor, Autobiography, Memoir","Biography & Autobiography / Personal Memoirs, ...",,,,A classic from the New York Times bestselling ...,Notes on Returning to America After 20 Years Away,0.112626


In [114]:
# Get the 10 books with the most negative sentiments
top_negative = books_300.loc[sentiments.nsmallest(10).index][['title', 'author', 'description', 'genres', 'sentiment']]
print("Top 10 books with the most negative sentiments:")
top_negative

Top 10 books with the most negative sentiments:


Unnamed: 0,title,author,description,genres,sentiment
98,El perfume,Patrick Süskind,Jean Baotiste Grenouille nació con muy poca su...,"Novel, Mystery, Horror fiction, Magical Realis...",-0.85
47,There's Always Enough,"Rolland Baker, Heidi Baker",Tells the story of an adventure that began whe...,,-0.4
79,Dalit,V. T. Rajshekar Shetty,"After Centuries of slavery, apartheid and ethn...",,-0.4
143,Motorcycle Basics Techbook,"Matthew Coombs, John Haynes, Pete Shoemark","All common engine, chassis and electrical syst...",,-0.3
188,Memoirs of a Geisha,Arthur Golden,The strikingly pretty child of an impoverished...,"Novel, Historical Fiction",-0.294444
172,Cryptonomicon Tome 2 : Le réseau Kinakuta,Neal Stephenson,Analyse : Roman de science-fiction (hard scien...,"Science fiction, Adventure fiction",-0.291667
0,Harry Potter and the Chamber of Secrets,J. K. Rowling,When the Chamber of Secrets is opened again at...,"Novel, Fantasy Fiction, Bildungsroman, High fa...",-0.275
82,Una arruga en el tiempo,,Two courageous children embark on a dangerous ...,"Novel, Science fiction, Young adult fiction, F...",-0.233333
244,The Bacchae and Other Plays,Euripides,"Through their sheer range, daring innovation, ...","Poetry, Drama",-0.231944
43,Always Enough,"Rolland Baker, Heidi Baker","Even the most desperate poverty, the most deva...","Biography, Christian literature",-0.186905


In [119]:
# Get the 10 books with the most positive sentiments
top_positive = books_300.loc[sentiments.nlargest(10).index][['title', 'author', 'description', 'genres', 'sentiment']]
print("\nTop 10 books with the most positive sentiments:")
top_positive


Top 10 books with the most positive sentiments:


Unnamed: 0,title,author,description,genres,sentiment
40,The Heidi Chronicles,Wendy Wasserstein,THE STORY: Comprised of a series of interrelat...,,0.75
88,Henry Miller on Writing,Henry Miller,Some of the most rewarding pages in Henry Mill...,,0.65
215,Juiced,Doug Walsh,"BradyGames' ""Juiced Official Strategy Guide"" i...",,0.548611
69,Ruby Ann's Down Home Trailer Park BBQin' Cookbook,Ruby Ann Boxcar,b/w photos throughout. The voluptuous gourmet ...,"Humor, Cookbook",0.52
65,EBay Timesaving Techniques For Dummies,Marsha Collier,"So, you’ve gotten started on eBay. You’ve made...",,0.51
127,Nikola Tesla,Carol Dommermuth-Costa,These informative and inspiring biographies wi...,,0.5
169,Lonely Planet London 2002,Mark Honan,Discovering great food in wonderful restaurant...,,0.491667
138,Killing Yourself to Live,Chuck Klosterman,Building on the national bestselling success o...,"Biography, Autobiography",0.484091
45,Heidi,Johanna Spyri,Heidi was first published in 1880. A classic t...,"Novel, Children's literature, Fiction",0.479167
94,The Ravishing of Lol Stein,Marguerite Duras,The Ravishing of Lol Stein is a haunting early...,,0.471429


# Jaccard Similarity (daniels)

In [162]:
# Getting fresh books
books_300_2 = pd.read_csv('../Daniels_work/sample300books.csv')
print(books_300_2.shape)

books_300_2 = books_300_2.dropna(subset=['genres']).reset_index(drop=True)

# Filter out non-string descriptions
genres = books_300_2['genres'].astype(str)
print(genres.shape)


(300, 18)
(151,)


In [163]:
# Preprocess the genres column using the preprocess_text function
preprocessed_genres = genres.apply(preprocess_text)
preprocessed_genres

0      novel , fantasy fiction , bildungsroman , high...
1      novel , fantasy fiction , adventure fiction , ...
2      novel , fantasy fiction , adventure fiction , ...
3                                science fiction , humor
4                                science fiction , humor
                             ...                        
146                                      drama , tragedy
147                                       drama , comedy
148                    drama , comedy , humorous fiction
149                                       drama , comedy
150      novel , mystery , bildungsroman , crime fiction
Name: genres, Length: 151, dtype: object

In [164]:
def jaccard_similarity(s1, s2):
    set1 = set(s1)
    set2 = set(s2)
    if len(set1 | set2) == 0:
        return 0
    return len(set1 & set2) / len(set1 | set2)


In [165]:
similarities = []
for i, row in preprocessed_genres.to_frame().iterrows():
    genres1 = row['genres']
    similarity_scores = []
    for j, row2 in preprocessed_genres.to_frame().iterrows():
        genres2 = row2['genres']
        similarity = jaccard_similarity(genres1, genres2)
        similarity_scores.append(similarity)
    similarities.append(similarity_scores)


In [166]:
len(similarities)

151

In [168]:
# Select a book
selected_book = books_300_2.iloc[22]['title']

# Gets the index of the selected book
book_index = books_300_2.loc[books_300_2['title'] == selected_book].index[0]

# Calculate the similarity scores between the selected book and all other books
similarity_scores = similarities[book_index]

# Create a list of similar books with their similarity scores
similar_books = [(score, books_300_2.loc[i]['title']) for i, score in enumerate(similarity_scores) if i != book_index]

# Add the selected book to the list of similar books
selected_book_tuple = (1.0, selected_book)
similar_books.append(selected_book_tuple)

# Sort the list of similar books in descending order based on the similarity scores
similar_books.sort(reverse=True)

# Print out the top 10 most similar books
print(f"Books similar to '{selected_book}':")
for i, book in enumerate(similar_books[:10]):
    print(f"{i+1}. {book[1]} ({book[0]:.2f} similarity)")


Books similar to 'Coming into the country':
1. Long Way Round (1.00 similarity)
2. Coming into the country (1.00 similarity)
3. I'm a Stranger Here Myself (0.94 similarity)
4. Tropic of Capricorn (0.84 similarity)
5. Heirs of General Practice (0.83 similarity)
6. The White Album (0.82 similarity)
7. Untouchables (0.81 similarity)
8. No Price Too High (0.81 similarity)
9. My Inventions (0.81 similarity)
10. Living to Tell the Tale (0.81 similarity)
