In [6]:
import re

class RepeatReplacer:

  def __init__(self):
    self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
    self.repl = r'\1\2\3'

  def replace(self, word):
    repl_word = self.repeat_regexp.sub(self.repl, word)

    if repl_word != word:
      return self.replace(repl_word)
    else:
      return repl_word
    
replacer = RepeatReplacer()
text = replacer.replace("looooove")
text1 = replacer.replace("gooose")
text2 = replacer.replace("ooooooohh")
print(text)
print(text1)
print(text2)

love
gose
oh


In [8]:
import re
from nltk.corpus import wordnet

class RepeatReplacer:

  def __init__(self):
    self.repeat_regexp = re.compile(r'(\w*)(\w)\2(\w*)')
    self.repl = r'\1\2\3'

  def replace(self, word):
    if wordnet.synsets(word):
      return word
    repl_word = self.repeat_regexp.sub(self.repl, word)

    if repl_word != word:
      return self.replace(repl_word)
    else:
      return repl_word
    
text = replacer.replace("goose")
text1 = replacer.replace("oooooh")
print(text)
print(text1)

gose
oh


In [1]:
class WordReplacer:
  def __init__(self,word_map):
    self.word_map = word_map

  def replace(self,word):
    return self.word_map.get(word,word)
  
replacer = WordReplacer({"bday" : "Birthday"})
replacer.replace("bday")

'Birthday'

In [2]:
import csv

class CsvWordReplacer(WordReplacer):

  def __init__(self,fname):
    word_map = {}
    for line in csv.reader(open(fname)):
      word, syn  = line
      word_map[word] = syn 
    super(CsvWordReplacer, self).__init__(word_map)

replacer = CsvWordReplacer("WordnetSynonyms.csv")
replacer.replace("happy")


ValueError: too many values to unpack (expected 2)

In [8]:
import csv

class CsvWordReplacer(WordReplacer):
    def __init__(self, fname):
        word_map = {}
        with open(fname, 'r') as file:
            reader = csv.DictReader(file)
            for row in reader:
                word = row['Word']
                synonyms = row['Synonyms']
                word_map[word] = synonyms
        super().__init__(word_map)

replacer = CsvWordReplacer("Synonyms.csv")

text = replacer.replace("bf")
text1 = replacer.replace("bday")
text2 = replacer.replace("pls")
text3 = replacer.replace("bff")
text4 = replacer.replace("cud")

print(text)
print(text1)
print(text2)
print(text3)
print(text4)

boyfriend
birthday
please
best friend forever
could


In [3]:
import pandas as pd
import csv
# Create a list of data
data = [
    ["bday", "birthday"],
    ["bff", "best friend forever"],
    ["lol", "laugh out loud"],
    ["gr8", "great"],
    ["gonna", "going to"],
    ["wanna", "want to"],
    ["cuz", "because"],
    ["thx", "thanks"],
    ["pls","please"],
    ["u", "you"],
    ["2day","today"],
    ["tmrw","tomorrow"],
    ["wknd","weekend"],
    ["gf","girlfriend"],
    ["bf","boyfriend"],
    ["wut","what"],
    ["thru","through"],
    ["cud","could"],
    ["pls","please"],
    ["tho","though"],
]

# Create a DataFrame
df = pd.DataFrame(data, columns=["Word", "Synonyms"])

# Save the DataFrame to an Excel file
df.to_csv("Synonyms.csv", index=False)


print("Synonyms.csv' file")


Synonyms.csv' file


In [54]:
from nltk.corpus import wordnet

class AntonymReplacer:

    def replace(self, word, pos=None):
        antonyms = set()

        for syn in wordnet.synsets(word, pos=pos):
            for lemma in syn.lemmas():
                for antonym in lemma.antonyms():
                    antonyms.add(antonym.name())

        if len(antonyms) == 1:
            return antonyms.pop()
        else:
            return None

    def replace_negations(self, sent):
        i, l = 0, len(sent)
        words = []
        while i < l:
            word = sent[i]
            if word == "not" and i + 1 < l:
                ant = self.replace(sent[i + 1])
                if ant:
                    words.append(ant)
                    i += 2
                    continue
            words.append(word)
            i += 1
        return words


replacer = AntonymReplacer()

sent = ["let's", 'not', 'uglify', 'our', 'code']


print(replacer.replace_negations(sent))


["let's", 'beautify', 'our', 'code']


In [1]:
import spacy
nlp = spacy.load("en_core_web_sm")

text = "Apple Inc. was founded by Steve Jobs and Steve Wozniak in California on April 1, 1976."

doc = nlp(text)

for ent in doc.ents:
  print(ent.text, "_", ent.label)


Apple Inc. _ 383
Steve Jobs _ 380
Steve Wozniak _ 380
California _ 384
April 1, 1976 _ 391


In [3]:
import spacy
from colorama import Fore, Style

# Load the English language model
nlp = spacy.load("en_core_web_sm")

# Define your text
text = "Apple Inc. was founded by Steve Jobs and Steve Wozniak in California on April 1, 1976."

# Process the text with spaCy
doc = nlp(text)

# Define color mappings for entity types
color_map = {
    "PERSON": Fore.BLUE,
    "ORG": Fore.GREEN,
    "GPE": Fore.RED,
    "DATE": Fore.YELLOW,
    # Add more entity types and corresponding colors if needed
}

# Iterate through the entities and print them with color
for ent in doc.ents:
    full_entity = text[ent.start_char:ent.end_char]
    label = ent.label_
    color = color_map.get(label, Fore.WHITE)  # Default color is white
    print(color + full_entity + " [" + label + "]" + Style.RESET_ALL)


TypeError: 'function' object is not iterable

In [11]:
import spacy
from spacy import displacy

# Load the English language model
nlp = spacy.load("en_core_web_lg")

# Define your text
text = "Apple Inc. was founded by Steve Jobs and Steve Wozniak in California on April 1, 1976."

# Process the text with spaCy
doc = nlp(text)

# # Define color mappings for entity types
# color_map = {
#     "PERSON": "#00FFFF",  # Cyan
#     "ORG": "#00FF00",     # Green
#     "GPE": "#FF0000",     # Red
#     "DATE": "#FFFF00",    # Yellow
#     "TIME": "#FF1493",    # DeepPink
#     "MONEY": "#FFD700",   # Gold
#     "PERCENT": "#9370DB", # MediumPurple
#     "QUANTITY": "#20B2AA",# LightSeaGreen
#     "ORDINAL": "#FF8C00", # DarkOrange
#     "CARDINAL": "#CD5C5C" # IndianRed
# }

# Options for displacy rendering
# options = {"ents": list(color_map.keys()), "colors": color_map}

# Render the entities with words using displacy
displacy.render(doc, style="ent")


In [17]:
import spacy
from spacy import displacy

# Load the English language model
nlp = spacy.load("en_core_web_lg")

# Define your text
text = "Apple Inc. was founded by Steve Jobs and Steve Wozniak in California on April 1, 1976."

# Process the text with spaCy
doc = nlp(text)

# Render the entities with words using displacy
displacy.render(doc, style="ent")


In [20]:
import spacy
from spacy import displacy
from nltk.tokenize import word_tokenize

# Load the English language model
nlp = spacy.load("en_core_web_lg")

# Define your text
text = "Apple Inc. was founded by Steve Jobs and Steve Wozniak in California on April 1, 1976."

# Tokenize the text using NLTK
word_tokens = word_tokenize(text)

# Join the tokens back into a single string
text = " ".join(word_tokens)

# Process the text with spaCy
doc = nlp(text)

# Render the entities with words using displacy
displacy.render(doc, style="ent")


In [30]:
import spacy
from spacy import displacy

# Load the English language model
nlp = spacy.load("en_core_web_sm")

# Define your text
text = "Apple Inc. was founded by Steve Jobs and Steve Wozniak in California on April 1, 1976."

# Tokenization: Break down the input text into individual words or tokens
tokens = nlp(text)

# Part-of-Speech (POS) Tagging: Assign grammatical roles to each token
pos_tags = [(token.text, token.pos_) for token in tokens]

# Named Entity Recognition: Identify named entities in the text
named_entities = [(entity.text, entity.label_) for entity in tokens.ents]

# Entity Classification: Classify named entities into predefined categories
entity_categories = [(entity.text, entity.label_) for entity in tokens.ents]

# Visualize Named Entities
displacy.render(tokens, style="ent", jupyter=True)


In [35]:
   
content = "Apple Inc. is a multinational technology company headquartered in Cupertino, California, USA. Founded on April 1, 1976, by Steve Jobs, Steve Wozniak, and Ronald Wayne, Apple has grown to become one of the world's leading innovators in consumer electronics, software, and services. "
 
doc = nlp(content)

from spacy import displacy
displacy.render(doc, style="ent")



In [34]:
import spacy
from spacy import displacy
from nltk.tokenize import word_tokenize

# Load the English language model
nlp = spacy.load("en_core_web_lg")

# Define your text
text = "Apple Inc. is a global technology company known for its innovative products like the iPhone, iPad, and Mac computers. Founded in 1976 by Steve Jobs, Steve Wozniak, and Ronald Wayne, Apple has a reputation for sleek design, seamless integration of hardware and software, and strong marketing. It offers a range of services such as iCloud, Apple Music, and Apple Pay, along with a commitment to environmental sustainability. Apple's impact on technology and culture is profound, shaping the way we interact with devices and consume media."

# Tokenize the text using NLTK
word_tokens = word_tokenize(text)

# Join the tokens back into a single string
text = " ".join(word_tokens)

# Process the text with spaCy
doc = nlp(text)

# Render the entities with words using displacy
displacy.render(doc, style="ent")


In [37]:
import spacy
from spacy import displacy

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = spacy.load("en_core_web_sm")

# Your text
text = "Named Entity Recognition"

# Process the text
doc = nlp(text)

# Define custom color scheme
colors = {"ORG": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}

# Visualize named entities with custom colors
options = {"ents": ["ORG"], "colors": colors}
displacy.render(doc, style="ent", jupyter=True, options=options)


In [3]:
import numpy as np
import pandas as pd

data = pd.read_csv("employees.csv")

data.head(5)



Unnamed: 0,EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID,SALARY,COMMISSION_PCT,MANAGER_ID,DEPARTMENT_ID
0,198,Donald,OConnell,DOCONNEL,650.507.9833,21-JUN-07,SH_CLERK,2600,-,124,50
1,199,Douglas,Grant,DGRANT,650.507.9844,13-JAN-08,SH_CLERK,2600,-,124,50
2,200,Jennifer,Whalen,JWHALEN,515.123.4444,17-SEP-03,AD_ASST,4400,-,101,10
3,201,Michael,Hartstein,MHARTSTE,515.123.5555,17-FEB-04,MK_MAN,13000,-,100,20
4,202,Pat,Fay,PFAY,603.123.6666,17-AUG-05,MK_REP,6000,-,201,20


In [10]:
print(data["EMAIL"].unique())
print(data["EMPLOYEE_ID"].unique())

['DOCONNEL' 'DGRANT' 'JWHALEN' 'MHARTSTE' 'PFAY' 'SMAVRIS' 'HBAER'
 'SHIGGINS' 'WGIETZ' 'SKING' 'NKOCHHAR' 'LDEHAAN' 'AHUNOLD' 'BERNST'
 'DAUSTIN' 'VPATABAL' 'DLORENTZ' 'NGREENBE' 'DFAVIET' 'JCHEN' 'ISCIARRA'
 'JMURMAN' 'LPOPP' 'DRAPHEAL' 'AKHOO' 'SBAIDA' 'STOBIAS' 'GHIMURO'
 'KCOLMENA' 'MWEISS' 'AFRIPP' 'PKAUFLIN' 'SVOLLMAN' 'KMOURGOS' 'JNAYER'
 'IMIKKILI' 'JLANDRY' 'SMARKLE' 'LBISSOT' 'MATKINSO' 'JAMRLOW' 'TJOLSON'
 'JMALLIN' 'MROGERS' 'KGEE' 'HPHILTAN' 'RLADWIG' 'SSTILES' 'JSEO' 'JPATEL']
[198 199 200 201 202 203 204 205 206 100 101 102 103 104 105 106 107 108
 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126
 127 128 129 130 131 132 133 134 135 136 137 138 139 140]


In [11]:
data["EMPLOYEE_ID"].value_counts(5)

EMPLOYEE_ID
198    0.02
128    0.02
118    0.02
119    0.02
120    0.02
121    0.02
122    0.02
123    0.02
124    0.02
125    0.02
126    0.02
127    0.02
129    0.02
199    0.02
130    0.02
131    0.02
132    0.02
133    0.02
134    0.02
135    0.02
136    0.02
137    0.02
138    0.02
139    0.02
117    0.02
116    0.02
115    0.02
114    0.02
200    0.02
201    0.02
202    0.02
203    0.02
204    0.02
205    0.02
206    0.02
100    0.02
101    0.02
102    0.02
103    0.02
104    0.02
105    0.02
106    0.02
107    0.02
108    0.02
109    0.02
110    0.02
111    0.02
112    0.02
113    0.02
140    0.02
Name: proportion, dtype: float64

In [12]:
one_hot = pd.get_dummies(data, columns=["FIRST_NAME","EMPLOYEE_ID"])
print(one_hot)

      LAST_NAME     EMAIL  PHONE_NUMBER  HIRE_DATE      JOB_ID  SALARY  \
0      OConnell  DOCONNEL  650.507.9833  21-JUN-07    SH_CLERK    2600   
1         Grant    DGRANT  650.507.9844  13-JAN-08    SH_CLERK    2600   
2        Whalen   JWHALEN  515.123.4444  17-SEP-03     AD_ASST    4400   
3     Hartstein  MHARTSTE  515.123.5555  17-FEB-04      MK_MAN   13000   
4           Fay      PFAY  603.123.6666  17-AUG-05      MK_REP    6000   
5        Mavris   SMAVRIS  515.123.7777  07-JUN-02      HR_REP    6500   
6          Baer     HBAER  515.123.8888  07-JUN-02      PR_REP   10000   
7       Higgins  SHIGGINS  515.123.8080  07-JUN-02      AC_MGR   12008   
8         Gietz    WGIETZ  515.123.8181  07-JUN-02  AC_ACCOUNT    8300   
9          King     SKING  515.123.4567  17-JUN-03     AD_PRES   24000   
10      Kochhar  NKOCHHAR  515.123.4568  21-SEP-05       AD_VP   17000   
11      De Haan   LDEHAAN  515.123.4569  13-JAN-01       AD_VP   17000   
12       Hunold   AHUNOLD  590.423.456

In [13]:
import pandas as pd

data = {'Color': ['Red', 'Blue', 'Green', 'Red', 'Green']}

df = pd.DataFrame(data)

one_hot = pd.get_dummies(df["Color"])

print(one_hot)

    Blue  Green    Red
0  False  False   True
1   True  False  False
2  False   True  False
3  False  False   True
4  False   True  False


In [10]:
import pandas as pd

data = {'Fruit': ['Apple', 'Orange', 'Banana', 'Apple',"Mango"]}

df = pd.DataFrame(data)

ohm = pd.get_dummies(df,columns=["Fruit"],dtype=int)
print(ohm)

   Fruit_Apple  Fruit_Banana  Fruit_Mango  Fruit_Orange
0            1             0            0             0
1            0             0            0             1
2            0             1            0             0
3            1             0            0             0
4            0             0            1             0


In [12]:
import pandas as pd

data = {'Fruit': ['Apple', 'Orange', 'Banana', 'Apple',"Mango"]}

df = pd.DataFrame(data)

df.head()

Unnamed: 0,Fruit
0,Apple
1,Orange
2,Banana
3,Apple
4,Mango


One Hot Encoding using Pandas Library

In [63]:
import pandas as pd

data = {'Fruit': ['Apple', 'Orange', 'Banana']}

df = pd.DataFrame(data)

df.head()

Unnamed: 0,Fruit
0,Apple
1,Orange
2,Banana


In [64]:
ohm = pd.get_dummies(df,columns=["Fruit"],dtype=int)
print(ohm)

   Fruit_Apple  Fruit_Banana  Fruit_Orange
0            1             0             0
1            0             0             1
2            0             1             0


In [26]:
import pandas as pd

fruits_data = pd.read_csv("fruits_dataset.csv")

df = pd.DataFrame(fruits_data)

df = df.drop(columns="Price")

ohm = pd.get_dummies(df,columns=["Fruits"],dtype=int)

print(ohm)

    Color  Fruits_Apple  Fruits_Banana  Fruits_Orange
0     Red             1              0              0
1  Orange             0              0              1
2  Yellow             0              1              0
3   Green             1              0              0
4  Yellow             0              1              0
5  Orange             0              0              1
6  Yellow             0              1              0


In [27]:
ohm_color = pd.get_dummies(df,columns=["Color"], dtype=int)
print(ohm_color)

   Fruits  Color_Green  Color_Orange  Color_Red  Color_Yellow
0   Apple            0             0          1             0
1  Orange            0             1          0             0
2  Banana            0             0          0             1
3   Apple            1             0          0             0
4  Banana            0             0          0             1
5  Orange            0             1          0             0
6  Banana            0             0          0             1


In [29]:
import pandas as pd

fruits_data = pd.read_csv("fruits_dataset.csv")

df = pd.DataFrame(fruits_data)

df = df.drop(columns="Price")

df.head()

Unnamed: 0,Fruits,Color
0,Apple,Red
1,Orange,Orange
2,Banana,Yellow
3,Apple,Green
4,Banana,Yellow


In [60]:
ohm = pd.get_dummies(df,columns=["Fruits","Color"],dtype=int)

print(ohm)

KeyError: "None of [Index(['Fruits', 'Color'], dtype='object')] are in the [columns]"

One Hot Encoding using Pandas Library

In [4]:
import pandas as pd

fruits_data = pd.read_csv("fruits_dataset.csv")

df = pd.DataFrame(fruits_data)

df.head()


Unnamed: 0,Fruits,Color,Price
0,Apple,Red,0.75
1,Orange,Orange,0.6
2,Banana,Yellow,0.41
3,Apple,Green,0.9
4,Banana,Yellow,0.5


In [2]:
df = df.drop(columns="Price")

In [3]:
df.head()

Unnamed: 0,Fruits,Color
0,Apple,Red
1,Orange,Orange
2,Banana,Yellow
3,Apple,Green
4,Banana,Yellow


In [8]:
one_hot_encode = pd.get_dummies(df,columns=["Color"],dtype=int)
print(one_hot_encode)

   Fruits  Color_Green  Color_Orange  Color_Red  Color_Yellow
0   Apple            0             0          1             0
1  Orange            0             1          0             0
2  Banana            0             0          0             1
3   Apple            1             0          0             0
4  Banana            0             0          0             1
5  Orange            0             1          0             0
6  Banana            0             0          0             1


In [45]:
import pandas as pd

fruits_data = pd.read_csv("fruits_dataset.csv")

df = pd.DataFrame(fruits_data)

df = df.drop(columns="Price")

ohm = pd.get_dummies(df,columns=["Color"],dtype=int)

print(ohm)

   Fruits  Color_Green  Color_Orange  Color_Red  Color_Yellow
0   Apple            0             0          1             0
1  Orange            0             1          0             0
2  Banana            0             0          0             1
3   Apple            1             0          0             0
4  Banana            0             0          0             1
5  Orange            0             1          0             0
6  Banana            0             0          0             1


One Hot Encoding using Sci-kit Learn Library

In [36]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

data = pd.read_csv("fruits_dataset.csv")

encoder = OneHotEncoder()

# Fit and transform the 'Fruits' column
ohm = encoder.fit_transform(data[['Fruits']])

# Create a DataFrame from the one-hot encoded data
one_hot = pd.DataFrame(ohm.toarray(), columns=encoder.get_feature_names_out(['Fruits']), dtype=int)

print("Original Data:")
print(data)

print("\nEncoded Data:")
print(one_hot)


Original Data:
   Fruits   Color  Price
0   Apple     Red   0.75
1  Orange  Orange   0.60
2  Banana  Yellow   0.41
3   Apple   Green   0.90
4  Banana  Yellow   0.50
5  Orange  Orange   0.65
6  Banana  Yellow   0.45

Encoded Data:
   Fruits_Apple  Fruits_Banana  Fruits_Orange
0             1              0              0
1             0              0              1
2             0              1              0
3             1              0              0
4             0              1              0
5             0              0              1
6             0              1              0


In [31]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

data = pd.read_csv("fruits_dataset.csv")

encoder = OneHotEncoder()

one_hot_encode = encoder.fit_transform(data[["Color"]])

one_hot_df = pd.DataFrame(one_hot_encode.toarray(),columns=encoder.get_feature_names_out(["Color"]), dtype=int)

result = pd.concat([data["Fruits"], one_hot_df],axis=1)

print(result)

   Fruits  Color_Green  Color_Orange  Color_Red  Color_Yellow
0   Apple            0             0          1             0
1  Orange            0             1          0             0
2  Banana            0             0          0             1
3   Apple            1             0          0             0
4  Banana            0             0          0             1
5  Orange            0             1          0             0
6  Banana            0             0          0             1


In [44]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

data = pd.read_csv("fruits_dataset.csv")


encoder = OneHotEncoder()

ohm = encoder.fit_transform(data[["Color"]])

ohm_df = pd.DataFrame(ohm.toarray(), columns=encoder.get_feature_names_out(["Color"]), dtype=int)

result = pd.concat([data["Fruits"], ohm_df], axis=1)


print(result)


Original Fruits Data
   Fruits   Color  Price
0   Apple     Red   0.75
1  Orange  Orange   0.60
2  Banana  Yellow   0.41
3   Apple   Green   0.90
4  Banana  Yellow   0.50
5  Orange  Orange   0.65
6  Banana  Yellow   0.45

Encoded Fruits Data
   Fruits  Color_Green  Color_Orange  Color_Red  Color_Yellow
0   Apple            0             0          1             0
1  Orange            0             1          0             0
2  Banana            0             0          0             1
3   Apple            1             0          0             0
4  Banana            0             0          0             1
5  Orange            0             1          0             0
6  Banana            0             0          0             1


In [None]:
print("Original Fruits Data")
print(data)

print("\nEncoded Fruits Data")

In [67]:

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Load the fruits dataset from the CSV file
data = pd.read_csv("fruits_dataset.csv")

# Initialize the OneHotEncoder
encoder = OneHotEncoder()

# Fit and transform the 'Color' column
ohm = encoder.fit_transform(data[["Color"]])

# Create a DataFrame from the one-hot encoded data
ohm_df = pd.DataFrame(ohm.toarray(), columns=encoder.get_feature_names_out(["Color"]), dtype=int)

# Concatenate the 'Fruits' column with the one-hot encoded DataFrame
result = pd.concat([data["Fruits"], ohm_df], axis=1)

# Print the resulting DataFrame
print(result)


   Fruits  Color_Green  Color_Orange  Color_Red  Color_Yellow
0   Apple            0             0          1             0
1  Orange            0             1          0             0
2  Banana            0             0          0             1
3   Apple            1             0          0             0
4  Banana            0             0          0             1
5  Orange            0             1          0             0
6  Banana            0             0          0             1


In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Step 1: Sample documents
documents = [
    "The quick brown fox jumps over the lazy dog.",
    "The lazy dog sleeps peacefully.",
    "The quick brown fox and the lazy dog are friends."
]

# Step 2: Initialize CountVectorizer
vectorizer = CountVectorizer()

# Step 3: Tokenization and vocabulary creation
# Fit the CountVectorizer to learn the vocabulary and transform documents into count vectors
count_vectors = vectorizer.fit_transform(documents)

# Step 4: Access the vocabulary
vocabulary = vectorizer.get_feature_names_out()

# Step 5: Print count vectors and vocabulary
print("Document")
print(documents)
print("\nCount Vectors:")
print(count_vectors.toarray())
print("\nVocabulary:")
print(vocabulary)


Document
['The quick brown fox jumps over the lazy dog.', 'The lazy dog sleeps peacefully.', 'The quick brown fox and the lazy dog are friends.']

Count Vectors:
[[0 0 1 1 1 0 1 1 1 0 1 0 2]
 [0 0 0 1 0 0 0 1 0 1 0 1 1]
 [1 1 1 1 1 1 0 1 0 0 1 0 2]]

Vocabulary:
['and' 'are' 'brown' 'dog' 'fox' 'friends' 'jumps' 'lazy' 'over'
 'peacefully' 'quick' 'sleeps' 'the']


In [2]:
import nltk
import re
import numpy as np

text = """Beans. I was trying to explain to somebody as we were flying in, that’s corn. That’s beans. And they were very impressed at my agricultural knowledge. Please give it up for Amaury once again for that outstanding introduction. I have a bunch of good friends here today, including somebody who I served with, who is one of the finest senators in the country, and we’re lucky to have him, your Senator, Dick Durbin is here. I also noticed, by the way, former Governor Edgar here, who I haven’t seen in a long time, and somehow he has not aged and I have. And it’s great to see you, Governor. I want to thank President Killeen and everybody at the U of I System for making it possible for me to be here today. And I am deeply honored at the Paul Douglas Award that is being given to me. He is somebody who set the path for so much outstanding public service here in Illinois. Now, I want to start by addressing the elephant in the room. I know people are still wondering why I didn’t speak at the commencement."""

dataset = nltk.sent_tokenize(text)

for i in range(len(dataset)):
  dataset[i] = dataset[i].lower()
  dataset[i] = re.sub(r"\w"," ",dataset[i])
  dataset[i] = re.sub(r"\s+", " ",dataset[i]) 


word2count = {} 
for data in dataset: 
    words = nltk.word_tokenize(data) 
    for word in words: 
        if word not in word2count.keys(): 
            word2count[word] = 1
        else: 
            word2count[word] += 1

import heapq 
freq_words = heapq.nlargest(100, word2count, key=word2count.get)

X = [] 
for data in dataset: 
    vector = [] 
    for word in freq_words: 
        if word in nltk.word_tokenize(data): 
            vector.append(1) 
        else: 
            vector.append(0) 
    X.append(vector) 
X = np.asarray(X) 

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample documents
documents = [
    "The quick brown fox jumps over the lazy dog.",
    "The lazy dog sleeps peacefully.",
    "The quick brown fox and the lazy dog are friends."
]

# Step 1: Initialize CountVectorizer
vectorizer = CountVectorizer()

# Step 2: Learn the vocabulary and transform documents into count vectors
count_vectors = vectorizer.fit_transform(documents)

# Step 3: Access the vocabulary
vocabulary = vectorizer.get_feature_names_out()

# Step 4: Print the count vectors and vocabulary
print("Count Vectors:")
print(count_vectors.toarray())
print("\nVocabulary:")
print(vocabulary)


Count Vectors:
[[0 0 1 1 1 0 1 1 1 0 1 0 2]
 [0 0 0 1 0 0 0 1 0 1 0 1 1]
 [1 1 1 1 1 1 0 1 0 0 1 0 2]]

Vocabulary:
['and' 'are' 'brown' 'dog' 'fox' 'friends' 'jumps' 'lazy' 'over'
 'peacefully' 'quick' 'sleeps' 'the']


In [8]:
import nltk
from nltk.tokenize import word_tokenize

# Download the necessary resources for word tokenization
# nltk.download('punkt')

# Example sentence
sentence = "The cat is on the mat!"

# Tokenize the sentence into words
tokens = word_tokenize(sentence)

print(tokens)


['The', 'cat', 'is', 'on', 'the', 'mat', '!']


Word   Index
!:   0
.:   1
The:   2
cat:   3
dog:   4
is:   5
mat:   6
on:   7
table:   8
the:   9
under:   10


In [11]:
def create_word_count_vectors(corpus, vocabulary, word_to_index):
    word_count_vectors = []
    # Iterate through each document in the corpus
    for document in corpus:
        # Initialize a vector with zeros for each word in the vocabulary
        word_count_vector = [0] * len(vocabulary)
        # Tokenize the document into words
        tokens = word_tokenize(document)
        # Count the occurrence of each word
        for token in tokens:
            if token in vocabulary:
                word_index = word_to_index[token]
                word_count_vector[word_index] += 1
        # Add the word count vector to the list
        word_count_vectors.append(word_count_vector)
    return word_count_vectors

# Example corpus
corpus = [
    "The cat is on the mat!",
    "The dog is under the table."
]

# Create the vocabulary
vocabulary = create_vocabulary(corpus)
# Create a word-to-index mapping
word_to_index = {word: idx for idx, word in enumerate(vocabulary)}

# Create word count vectors for each document
word_count_vectors = create_word_count_vectors(corpus, vocabulary, word_to_index)

# Print the word count vectors
print(corpus)
print("Word Count Vectors:")
for i, vector in enumerate(word_count_vectors):
    print(f"Document {i + 1}: {vector}")


['The cat is on the mat!', 'The dog is under the table.']
Word Count Vectors:
Document 1: [1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0]
Document 2: [0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1]


In [31]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from tensorflow.keras.preprocessing.text import Tokenizer

# Download NLTK resources if not already downloaded
# nltk.download('punkt')
# nltk.download('stopwords')

# Sample documents
documents = ["The quick brown fox jumps over the lazy dog.", "The lazy dog sleeps peacefully."]

# Tokenization and remove stopwords
stop_words = set(stopwords.words('english'))
bag_of_words = []

for doc in documents:
    tokens = word_tokenize(doc)
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    word_freq = Counter(filtered_tokens)
    bag_of_words.append(word_freq)

# Convert bag_of_words into a matrix format
# This matrix represents the bag-of-words representation
tokenizer = Tokenizer()
tokenizer.fit_on_texts(documents)
bag_of_words_matrix = tokenizer.texts_to_matrix(documents, mode='count')

print("Bag of Words Matrix:")
print(bag_of_words_matrix)


Bag of Words Matrix:
[[0. 2. 1. 1. 1. 1. 1. 1. 1. 0. 0.]
 [0. 1. 1. 1. 0. 0. 0. 0. 0. 1. 1.]]


In [32]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np

# Download NLTK resources if not already downloaded
# nltk.download('punkt')
# nltk.download('stopwords')

def create_word_count_vectors(corpus, tokenizer):
    word_count_vectors = []
    # Convert the corpus into sequences of word indices
    sequences = tokenizer.texts_to_sequences(corpus)
    # Iterate through each sequence
    for seq in sequences:
        # Initialize a vector with zeros for each unique word index
        word_count_vector = [0] * (len(tokenizer.word_index) + 1)
        # Count the occurrence of each word index
        for idx in seq:
            word_count_vector[idx] += 1
        # Add the word count vector to the list
        word_count_vectors.append(word_count_vector)
    return word_count_vectors

# Sample corpus
corpus = [
    "The cat is on the mat!",
    "The dog is under the table."
]

# Initialize the Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Create word count vectors for each document
word_count_vectors = create_word_count_vectors(corpus, tokenizer)

# Print the word count vectors
print("Corpus:")
print(corpus)
print("Word Count Vectors:")
for i, vector in enumerate(word_count_vectors):
    print(f"Document {i + 1}: {vector}")


Corpus:
['The cat is on the mat!', 'The dog is under the table.']
Word Count Vectors:
Document 1: [0, 2, 1, 1, 1, 1, 0, 0, 0]
Document 2: [0, 2, 1, 0, 0, 0, 1, 1, 1]


In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer

docs = [
  'The cat is sleeping on the mat.',
  'The dog is running in the garden.'
]

## Step 1: Determine the Vocabulary
tokenizer = Tokenizer()
tokenizer.fit_on_texts(docs)
print(f'{list(tokenizer.word_index.keys())}')

## Step 2: Count
vectors = tokenizer.texts_to_matrix(docs, mode='count')
print(vectors)

['the', 'is', 'cat', 'sleeping', 'on', 'mat', 'dog', 'running', 'in', 'garden']
[[0. 2. 1. 1. 1. 1. 1. 0. 0. 0. 0.]
 [0. 2. 1. 0. 0. 0. 0. 1. 1. 1. 1.]]


In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the stopwords corpus (only needed once)
# nltk.download('stopwords')

# Define the documents
documents = [
    "The cat is sleeping on the mat.",
    "The dog is running in the garden.",
    
]

# Get the list of English stopwords
stop_words = set(stopwords.words('english'))

# Tokenize each document and remove stopwords
words_without_stopwords = []
for document in documents:
    # Tokenize the document into words
    words = word_tokenize(document)
    # Remove stopwords from the tokenized words
    filtered_words = [word for word in words if word.lower() not in stop_words]
    # Add the filtered words to the list
    words_without_stopwords.extend(filtered_words)

# Display the remaining words after removing stopwords
print("Words without stopwords:")
print(words_without_stopwords)


Words without stopwords:
['cat', 'sleeping', 'mat', '.', 'dog', 'running', 'garden', '.']


In [22]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer

# Define the documents
documents = [
    "The cat is sleeping on the mat", 
    "The dog is running in the garden"
]

# Tokenize each document
word_tokens = [word_tokenize(document) for document in documents]

# Get the list of English stopwords
stop_words = set(stopwords.words('english'))

# Remove stopwords from each document
filtered_word_tokens = [[word for word in tokens if word.lower() not in stop_words] for tokens in word_tokens]

# Combine the filtered word tokens into a single row
combined_tokens = [' '.join(tokens) for tokens in filtered_word_tokens]

# Create a Tokenizer instance and fit it on the combined tokens
tokenizer = Tokenizer(num_words=None, lower=True, split=' ', char_level=False, oov_token=None)
tokenizer.fit_on_texts(combined_tokens)

# Convert combined tokens to sequences of word indices
word_index_sequences = tokenizer.texts_to_sequences(combined_tokens)

# Create Word Count Vectors
word_count_vectors = tokenizer.sequences_to_matrix(word_index_sequences, mode='count')

# Adjust indices to start from 1
word_count_vectors_shifted = word_count_vectors[:, 1:]

# Print the word count vectors
print("Word Count Vectors:")
print(combined_tokens)
print(word_count_vectors_shifted)


Word Count Vectors:
['cat sleeping mat', 'dog running garden']
[[1. 1. 1. 0. 0. 0.]
 [0. 0. 0. 1. 1. 1.]]


In [46]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer

# Define the documents
documents = [
    "The cat is sleeping on the mat", 
    "The dog is running in the garden"
]

# Tokenize each document
word_tokens = [word_tokenize(document) for document in documents]

# Get the list of English stopwords
stop_words = set(stopwords.words('english'))

# Remove stopwords from each document
filtered_word_tokens = [[word for word in tokens if word.lower() not in stop_words] for tokens in word_tokens]

# Combine the filtered word tokens into a single row
combined_tokens = [' '.join(tokens) for tokens in filtered_word_tokens]

# Create a Tokenizer instance and fit it on the combined tokens
tokenizer = Tokenizer(num_words=None, lower=True, split=' ', char_level=False, oov_token=None)
tokenizer.fit_on_texts(combined_tokens)

# Convert combined tokens to sequences of word indices
word_index_sequences = tokenizer.texts_to_sequences(combined_tokens)

# Create Word Count Vectors
word_count_vectors = tokenizer.sequences_to_matrix(word_index_sequences, mode='count')

# Adjust indices to start from 1
word_count_vectors_shifted = word_count_vectors[:, 1:]

# Print the word count vectors
print("Word Count Vectors:")
print(combined_tokens)
print(word_count_vectors_shifted)


Word Count Vectors:
['cat sleeping mat', 'dog running garden']
[[1. 1. 1. 0. 0. 0.]
 [0. 0. 0. 1. 1. 1.]]


In [32]:
from sklearn.feature_extraction.text import CountVectorizer

documents = [
    "The cat is sleeping on the mat", 
    "The dog is running in the garden"
]

bag_of_words = CountVectorizer(stop_words="english")

bag_of_words.fit(documents)

bag_of_words.get_feature_names_out()

bow_features = bag_of_words.transform(documents)

bow_features_array = bow_features.toarray()
bow_features_array

array([[1, 0, 0, 1, 0, 1],
       [0, 1, 1, 0, 1, 0]], dtype=int64)

In [40]:
print(bag_of_words.get_feature_names_out())
for sentence,feature in zip(documents,bow_features_array):
  print(sentence)
  print(feature)

['cat' 'dog' 'garden' 'mat' 'running' 'sleeping']
The cat is sleeping on the mat
[1 0 0 1 0 1]
The dog is running in the garden
[0 1 1 0 1 0]


In [42]:
from sklearn.feature_extraction.text import CountVectorizer

documents = [
    "The cat is sleeping on the mat", 
    "The dog is running in the garden"
]

bag_of_words = CountVectorizer(stop_words="english")

bag_of_words.fit(documents)

bag_of_words.get_feature_names_out()

bow_features = bag_of_words.transform(documents)

bow_features_array = bow_features.toarray()
bow_features_array

print(bag_of_words.get_feature_names_out())
for sentence,feature in zip(documents,bow_features_array):
  print(sentence)
  print(feature)

['cat' 'dog' 'garden' 'mat' 'running' 'sleeping']
The cat is sleeping on the mat
[1 0 0 1 0 1]
The dog is running in the garden
[0 1 1 0 1 0]


In [41]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import CountVectorizer

# Define the documents
documents = [
    "The cat is sleeping on the mat", 
    "The dog is running in the garden"
]

# Tokenize each document
word_tokens = [word_tokenize(document) for document in documents]

# Get the list of English stopwords
stop_words = set(stopwords.words('english'))

# Remove stopwords from each document
filtered_word_tokens = [[word for word in tokens if word.lower() not in stop_words] for tokens in word_tokens]

# Combine the filtered word tokens into a single row
combined_tokens = [' '.join(tokens) for tokens in filtered_word_tokens]

# Create a Tokenizer instance and fit it on the combined tokens
tokenizer = Tokenizer(num_words=None, lower=True, split=' ', char_level=False, oov_token=None)
tokenizer.fit_on_texts(combined_tokens)

# Convert combined tokens to sequences of word indices
word_index_sequences = tokenizer.texts_to_sequences(combined_tokens)

# Create Word Count Vectors using the Tokenizer
word_count_vectors = tokenizer.sequences_to_matrix(word_index_sequences, mode='count')

# Adjust indices to start from 1
word_count_vectors_shifted = word_count_vectors[:, 1:]

# Print the word count vectors using Tokenizer
print("Word Count Vectors using Tokenizer:")
print(combined_tokens)
print(word_count_vectors_shifted)

# Create Word Count Vectors using CountVectorizer
bag_of_words = CountVectorizer(stop_words="english")
bow_features = bag_of_words.fit_transform(combined_tokens)
bow_features_array = bow_features.toarray()

# Print the word count vectors using CountVectorizer
print("\nWord Count Vectors using CountVectorizer:")
print(bag_of_words.get_feature_names_out())
for sentence, feature in zip(documents, bow_features_array):
    print(sentence)
    print(feature)


Word Count Vectors using Tokenizer:
['cat sleeping mat', 'dog running garden']
[[1. 1. 1. 0. 0. 0.]
 [0. 0. 0. 1. 1. 1.]]

Word Count Vectors using CountVectorizer:
['cat' 'dog' 'garden' 'mat' 'running' 'sleeping']
The cat is sleeping on the mat
[1 0 0 1 0 1]
The dog is running in the garden
[0 1 1 0 1 0]


In [45]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

documents = [
    "The cat is sleeping on the mat", 
    "The dog is running in the garden"
]

bag_of_words = CountVectorizer(stop_words="english")

bag_of_words.fit(documents)

feature_names = bag_of_words.get_feature_names_out()

bow_features = bag_of_words.transform(documents)

bow_features_array = bow_features.toarray()

desired_features_order = ['cat', 'dog', 'garden', 'mat', 'running', 'sleeping']

print(desired_features_order)

for sentence, feature in zip(documents, bow_features_array):
    print(sentence)
    print([feature[np.where(feature_names == word)[0][0]] if word in feature_names else 0 for word in desired_features_order])


['cat', 'dog', 'garden', 'mat', 'running', 'sleeping']
The cat is sleeping on the mat
[1, 0, 0, 1, 0, 1]
The dog is running in the garden
[0, 1, 1, 0, 1, 0]


In [51]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer

# Define the documents
documents = [
    "The cat is sleeping on the mat", 
    "The dog is running in the garden"
]

# Tokenize each document
word_tokens = [word_tokenize(document) for document in documents]

# Get the list of English stopwords
stop_words = set(stopwords.words('english'))

# Remove stopwords from each document
filtered_word_tokens = [[word for word in tokens if word.lower() not in stop_words] for tokens in word_tokens]

# Create a Tokenizer instance and fit it on the filtered tokens
tokenizer = Tokenizer(num_words=None, lower=True, split=' ', char_level=False, oov_token=None)
tokenizer.fit_on_texts(filtered_word_tokens)

# Convert filtered tokens to sequences of word indices
word_index_sequences = tokenizer.texts_to_sequences(filtered_word_tokens)

# Create Word Count Vectors
word_count_vectors = tokenizer.sequences_to_matrix(word_index_sequences, mode='count')

# Adjust indices to start from 1
word_count_vectors_shifted = word_count_vectors[:, 1:]

# Get the feature names
feature_names = tokenizer.index_word.values()

# Print the feature names
print(list(feature_names))

# Print the word count vectors
print(documents)
for document, vector in zip(documents, word_count_vectors_shifted):
    print(document)
    print(vector)


['cat', 'sleeping', 'mat', 'dog', 'running', 'garden']
['The cat is sleeping on the mat\n', 'The dog is running in the garden']
The cat is sleeping on the mat

[1. 1. 1. 0. 0. 0.]
The dog is running in the garden
[0. 0. 0. 1. 1. 1.]


In [66]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer

# Define the documents
document1 = "The cat is sleeping on the mat"
document2 = "The dog is running in the garden"

# Combine the documents into a list
documents = [document1, document2]

# Tokenize each document
word_tokens = [word_tokenize(document) for document in documents]

# Get the list of English stopwords
stop_words = set(stopwords.words('english'))

# Remove stopwords from each document
filtered_word_tokens = [[word for word in tokens if word.lower() not in stop_words] for tokens in word_tokens]

# Create a Tokenizer instance and fit it on the filtered tokens
tokenizer = Tokenizer(num_words=None, lower=True, split=' ', char_level=False, oov_token=None)
tokenizer.fit_on_texts(filtered_word_tokens)

# Convert filtered tokens to sequences of word indices
word_index_sequences = tokenizer.texts_to_sequences(filtered_word_tokens)

# Create Word Count Vectors
word_count_vectors = tokenizer.sequences_to_matrix(word_index_sequences, mode='count')

# Adjust indices to start from 1
word_count_vectors_shifted = word_count_vectors[:, 1:]

# Get the feature names
feature_names = tokenizer.index_word.values()

# Print the feature names
print(list(feature_names))

# Print the word count vectors
# print(documents)
for document, vector in zip(documents, word_count_vectors_shifted):
    print(vector)


['cat', 'sleeping', 'mat', 'dog', 'running', 'garden']
[1. 1. 1. 0. 0. 0.]
[0. 0. 0. 1. 1. 1.]


In [57]:
from tensorflow.keras.preprocessing.text import Tokenizer

document1 = "The cat is sleeping on the mat"
document2 = "The dog is running in the garden"

## Step 1: Determine the Vocabulary
tokenizer = Tokenizer()
tokenizer.fit_on_texts(docs)
print(f'Vocabulary: {list(tokenizer.word_index.keys())}')

## Step 2: Count
vectors = tokenizer.texts_to_matrix(docs, mode='count')
print(vectors)

Vocabulary: ['the', 'cat', 'sat', 'hat', 'in', 'with']
[[0. 1. 1. 1. 0. 0. 0.]
 [0. 2. 1. 1. 1. 1. 0.]
 [0. 2. 1. 0. 1. 0. 1.]]


In [60]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Define the documents
document1 = "The cat is sleeping on the mat"
document2 = "The dog is running in the garden"

# Step 1: Determine the Vocabulary
tokenizer = Tokenizer()
tokenizer.fit_on_texts([document1, document2])
print(f'{list(tokenizer.word_index.keys())}')

# Step 2: Count
vectors = tokenizer.texts_to_matrix([document1, document2], mode='count')
print(document1)
print(vectors)


['the', 'is', 'cat', 'sleeping', 'on', 'mat', 'dog', 'running', 'in', 'garden']
The cat is sleeping on the mat
[[0. 2. 1. 1. 1. 1. 1. 0. 0. 0. 0.]
 [0. 2. 1. 0. 0. 0. 0. 1. 1. 1. 1.]]


In [62]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Define the documents
documents = [
    "The cat is sleeping on the mat", 
    "The dog is running in the garden"
]

# Step 1: Determine the Vocabulary
tokenizer = Tokenizer()
tokenizer.fit_on_texts(documents)
print(f'Vocabulary: {list(tokenizer.word_index.keys())}')

# Step 2: Count
vectors = tokenizer.texts_to_matrix(documents, mode='count')
print(document)
print(vectors)


Vocabulary: ['the', 'is', 'cat', 'sleeping', 'on', 'mat', 'dog', 'running', 'in', 'garden']
The dog is running in the garden
[[0. 2. 1. 1. 1. 1. 1. 0. 0. 0. 0.]
 [0. 2. 1. 0. 0. 0. 0. 1. 1. 1. 1.]]


In [77]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer

# Define the documents
doc1 = 'Game of Thrones is an amazing tv series!'
doc2 = 'Game of Thrones is the best tv series!'
doc3 = 'Game of Thrones is so great'

# Define a function to calculate Bag of Words (BOW) using custom Python code
def calculateBOW(wordset, l_doc):
    tf_diz = dict.fromkeys(wordset, 0)
    for word in l_doc:
        tf_diz[word] = l_doc.count(word)
    return tf_diz

# Preprocess the documents and split into word lists
l_doc1 = re.sub(r"[^a-zA-Z0-9]", " ", doc1.lower()).split()
l_doc2 = re.sub(r"[^a-zA-Z0-9]", " ", doc2.lower()).split()
l_doc3 = re.sub(r"[^a-zA-Z0-9]", " ", doc3.lower()).split()

# Create a set of unique words from all documents
wordset = set(l_doc1).union(set(l_doc2)).union(set(l_doc3))

# Calculate BOW for each document using custom Python code
bow1 = calculateBOW(wordset, l_doc1)
bow2 = calculateBOW(wordset, l_doc2)
bow3 = calculateBOW(wordset, l_doc3)

# Create DataFrame to display BOW representations calculated using custom Python code
# df_bow = pd.DataFrame([bow1, bow2, bow3])
# print("Bag of Words using custom Python code:")
# print(df_bow)

# Use CountVectorizer from scikit-learn to calculate BOW representations
vectorizer = CountVectorizer()
X = vectorizer.fit_transform([doc1, doc2, doc3])
df_bow_sklearn = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
print("\nBag of Words using scikit-learn CountVectorizer (with default parameters):")
print(df_bow_sklearn)

# Use CountVectorizer with stop words removal
vectorizer_sw = CountVectorizer(stop_words='english')
X_sw = vectorizer_sw.fit_transform([doc1, doc2, doc3])
df_bow_sklearn_sw = pd.DataFrame(X_sw.toarray(), columns=vectorizer_sw.get_feature_names_out())
print("\nBag of Words using scikit-learn CountVectorizer with stop words removal:")
print(df_bow_sklearn_sw)



Bag of Words using scikit-learn CountVectorizer (with default parameters):
   amazing  an  best  game  great  is  of  series  so  the  thrones  tv
0        1   1     0     1      0   1   1       1   0    0        1   1
1        0   0     1     1      0   1   1       1   0    1        1   1
2        0   0     0     1      1   1   1       0   1    0        1   0

Bag of Words using scikit-learn CountVectorizer with stop words removal:
   amazing  best  game  great  series  thrones  tv
0        1     0     1      0       1        1   1
1        0     1     1      0       1        1   1
2        0     0     1      1       0        1   0


In [88]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer

# Define the documents
doc1 = 'The cat is sleeping on the mat'
doc2 = 'The dog is running in the garden'

# Use CountVectorizer from scikit-learn to calculate BOW representations
# vectorizer = CountVectorizer()
# X = vectorizer.fit_transform([doc1, doc2])
# df_bow_sklearn = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
# print("\nBag of Words using scikit-learn CountVectorizer (with default parameters):")
# print(document1)
# print(df_bow_sklearn)

# Use CountVectorizer with stop words removal
vectorizer_sw = CountVectorizer(stop_words='english')
X_sw = vectorizer_sw.fit_transform([doc1, doc2])
df_bow_sklearn_sw = pd.DataFrame(X_sw.toarray(), columns=vectorizer_sw.get_feature_names_out())
print("\nBag of Words using scikit-learn CountVectorizer with stop words removal: ")
print()
print(df_bow_sklearn_sw)



Bag of Words using scikit-learn CountVectorizer with stop words removal: 

   cat  dog  garden  mat  running  sleeping
0    1    0       0    1        0         1
1    0    1       1    0        1         0


#### Final Code

In [92]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer

# Define the documents
doc1 = 'The cat is sleeping on the mat'
doc2 = 'The dog is running in the garden'

# Display the vocabulary with stop words removed
print("\nVocabulary with stop words removed:")
print(vectorizer_sw.get_feature_names_out())

# Use CountVectorizer with stop words removal
vectorizer_sw = CountVectorizer(stop_words='english')
X_sw = vectorizer_sw.fit_transform([doc1, doc2])
df_bow_sklearn_sw = pd.DataFrame(X_sw.toarray(), columns=vectorizer_sw.get_feature_names_out())
print("\nBag of Words using scikit-learn CountVectorizer with stop words removal:")
print(df_bow_sklearn_sw)





Vocabulary with stop words removed:
['cat' 'dog' 'garden' 'mat' 'running' 'sleeping']

Bag of Words using scikit-learn CountVectorizer with stop words removal:
   cat  dog  garden  mat  running  sleeping
0    1    0       0    1        0         1
1    0    1       1    0        1         0


In [96]:
from sklearn.feature_extraction.text import CountVectorizer

# Define the documents
doc1 = "The cat is sleeping on the mat."
doc2 = "The dog is running in the garden."

# Create a CountVectorizer instance
vectorizer = CountVectorizer()

# Fit the vectorizer on the documents and transform them into a bag of words representation
bag_of_words = vectorizer.fit_transform([doc1, doc2])

# Convert the bag of words representation to a DataFrame for better visualization
import pandas as pd
bow_df = pd.DataFrame(bag_of_words.toarray(), columns=vectorizer.get_feature_names_out(), index=['Doc1', 'Doc2'])

# Print the vocabulary
print("Vocabulary:")
# print(doc1)
# print(doc2)
print(vectorizer.get_feature_names_out())

# Print the bag of words representation
# print("\nBag of Words Representation:")
print(bow_df)


Vocabulary:
['cat' 'dog' 'garden' 'in' 'is' 'mat' 'on' 'running' 'sleeping' 'the']
      cat  dog  garden  in  is  mat  on  running  sleeping  the
Doc1    1    0       0   0   1    1   1        0         1    2
Doc2    0    1       1   1   1    0   0        1         0    2


In [105]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Define the documents
doc1 = "The cat is sleeping on the mat."
doc2 = "The dog is running in the garden."

# Create a CountVectorizer instance with stop words removed
vectorizer = CountVectorizer(stop_words='english')

# Fit the vectorizer on the documents and transform them into a bag of words representation
bag_of_words = vectorizer.fit_transform([doc1, doc2])

# Convert the bag of words representation to a DataFrame for better visualization

bow_df = pd.DataFrame(bag_of_words.toarray(), columns=vectorizer.get_feature_names_out(), index=['Doc1', 'Doc2'])

# Print the vocabulary
print("Vocabulary: ",vectorizer.get_feature_names_out())
print()

# Print the bag of words representation after removing stop words
print("\nBag of Words Representation after removing stop words:\n")
print(bow_df)

Vocabulary:  ['cat' 'dog' 'garden' 'mat' 'running' 'sleeping']


Bag of Words Representation after removing stop words:

      cat  dog  garden  mat  running  sleeping
Doc1    1    0       0    1        0         1
Doc2    0    1       1    0        1         0


In [109]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Define the documents
doc1 = "The cat is sleeping on the mat."
doc2 = "The dog is running in the garden."

# Create a CountVectorizer instance with stop words removed
vectorizer = CountVectorizer(stop_words='english')

# Fit the vectorizer on the documents and transform them into a bag of words representation
bag_of_words = vectorizer.fit_transform([doc1, doc2])

# Convert the bag of words representation to a DataFrame for better visualization
bow_df = pd.DataFrame(bag_of_words.toarray(), columns=vectorizer.get_feature_names_out(), index=['Doc1', 'Doc2'])

# Print the vocabulary
print("Vocabulary: ", vectorizer.get_feature_names_out())
print()

# Print the bag of words representation after removing stop words
print("Bag of Words Representation after removing stop words:\n")
print(bow_df)


Vocabulary:  ['cat' 'dog' 'garden' 'mat' 'running' 'sleeping']

Bag of Words Representation after removing stop words:

      cat  dog  garden  mat  running  sleeping
Doc1    1    0       0    1        0         1
Doc2    0    1       1    0        1         0


#### TF-IDF

In [110]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Example documents
documents = [
    "The cat sat on the mat.",
    "The dog played in the yard.",
    "The cat and the dog are friends."
]

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit the vectorizer to the documents and transform them to TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Get feature names (terms)
terms = tfidf_vectorizer.get_feature_names_out()

# Print TF-IDF scores for each term in each document
for i, document in enumerate(documents):
    print(f"Document {i + 1}:")
    for j, term in enumerate(terms):
        tfidf_score = tfidf_matrix[i, j]
        if tfidf_score > 0:
            print(f"    {term}: {tfidf_score:.3f}")


Document 1:
    cat: 0.341
    mat: 0.448
    on: 0.448
    sat: 0.448
    the: 0.530
Document 2:
    dog: 0.341
    in: 0.448
    played: 0.448
    the: 0.530
    yard: 0.448
Document 3:
    and: 0.424
    are: 0.424
    cat: 0.323
    dog: 0.323
    friends: 0.424
    the: 0.501


In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Example documents
documents = [
    "The cat sat on the mat.",
    "The dog played in the yard.",
    "The cat and the dog are friends."
]

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit the vectorizer to the documents and transform them to TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Get feature names (terms)
terms = tfidf_vectorizer.get_feature_names_out()

# Print TF-IDF scores for each term in each document
for i, document in enumerate(documents):
    print(f"Document {i + 1}:")
    for j, term in enumerate(terms):
        tfidf_score = tfidf_matrix[i, j]
        if tfidf_score > 0:
            print(f"    {term}: {tfidf_score:.3f}")


Document 1:
    cat: 0.341
    mat: 0.448
    on: 0.448
    sat: 0.448
    the: 0.530
Document 2:
    dog: 0.341
    in: 0.448
    played: 0.448
    the: 0.530
    yard: 0.448
Document 3:
    and: 0.424
    are: 0.424
    cat: 0.323
    dog: 0.323
    friends: 0.424
    the: 0.501


In [4]:
import pandas as pd
import numpy as np

documents = [
    "The cat sat on the mat.",
    "The dog played in the yard.",
    "The cat and the dog are friends."
]

words_set = set()

for doc in documents:
  words = doc.split(" ")
  words_set = words_set.union(set(words))

print("Number Of words in the Corpus : ",len(words_set))
print("The Words in the corpus: \n", words_set)




Number Of words in the Corpus :  13
The Words in the corpus: 
 {'played', 'dog', 'and', 'friends.', 'yard.', 'on', 'are', 'sat', 'The', 'cat', 'mat.', 'in', 'the'}


In [5]:
n_docs = len(documents)
n_words_set = len(words_set)

df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), columns=words_set)

for i in range(n_docs):
  words = corpus[i].split(" ")
  for w in words:
    df_tf[w][i] = df_tf[w][i] + (1 / len(words))

df_tf

ValueError: columns cannot be a set

In [12]:
import pandas as pd
import numpy as np

documents = [
    "The cat sat on the mat.",
    "The dog played in the yard.",
    "The cat and the dog are friends."
]

words_set = set()

# Create a set of unique words in the corpus
for doc in documents:
    words = doc.split(" ")
    words_set = words_set.union(set(words))

print("Number of words in the Corpus:", len(words_set))
print("The words in the corpus:\n", words_set)

n_docs = len(documents)
n_words_set = len(words_set)

# Convert the set of words into a list
words_list = list(words_set)

# Initialize a DataFrame to store TF values
df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), columns=words_list)

# Calculate TF for each term in each document
for i in range(n_docs):
    words = documents[i].split(" ")
    word_count = len(words)
    for word in words:
        df_tf.at[i, word] += 1 / word_count

print("\nTerm Frequency (TF) DataFrame:")
print(df_tf)

print("IDF of: ")

idf = {}

for w in words_set:
    k = 0    # number of documents in the corpus that contain this word
    
    for i in range(n_docs):
        if w in documents[i].split():
            k += 1
            
    idf[w] =  np.log10(n_docs / k)
    
    print(f'{w:>15}: {idf[w]:>10}' )

df_tf_idf = df_tf.copy()

for w in words_set:
    for i in range(n_docs):
        df_tf_idf[w][i] = df_tf[w][i] * idf[w]
        
df_tf_idf

from sklearn.feature_extraction.text import TfidfVectorizer

tr_idf_model  = TfidfVectorizer()
tf_idf_vector = tr_idf_model.fit_transform(documents)

print(type(tf_idf_vector), tf_idf_vector.shape)

tf_idf_array = tf_idf_vector.toarray()

print(tf_idf_array)



Number of words in the Corpus: 13
The words in the corpus:
 {'played', 'dog', 'and', 'friends.', 'yard.', 'on', 'are', 'sat', 'The', 'cat', 'mat.', 'in', 'the'}

Term Frequency (TF) DataFrame:
     played       dog       and  friends.     yard.        on       are  \
0  0.000000  0.000000  0.000000  0.000000  0.000000  0.166667  0.000000   
1  0.166667  0.166667  0.000000  0.000000  0.166667  0.000000  0.000000   
2  0.000000  0.142857  0.142857  0.142857  0.000000  0.000000  0.142857   

        sat       The       cat      mat.        in       the  
0  0.166667  0.166667  0.166667  0.166667  0.000000  0.166667  
1  0.000000  0.166667  0.000000  0.000000  0.166667  0.166667  
2  0.000000  0.142857  0.142857  0.000000  0.000000  0.142857  
IDF of: 
         played: 0.47712125471966244
            dog: 0.17609125905568124
            and: 0.47712125471966244
       friends.: 0.47712125471966244
          yard.: 0.47712125471966244
             on: 0.47712125471966244
            are: 0.

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_tf_idf[w][i] = df_tf[w][i] * idf[w]


In [8]:
import pandas as pd
import numpy as np
import math

documents = [
    "The cat sat on the mat.",
    "The dog played in the yard.",
    "The cat and the dog are friends."
]

words_set = set()

# Create a set of unique words in the corpus
for doc in documents:
    words = doc.split(" ")
    words_set = words_set.union(set(words))

print("Number of words in the Corpus:", len(words_set))
print("The words in the corpus:\n", words_set)

n_docs = len(documents)
n_words_set = len(words_set)

# Convert the set of words into a list
words_list = list(words_set)

# Initialize a DataFrame to store TF values
df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), columns=words_list)

# Calculate TF for each term in each document
for i in range(n_docs):
    words = documents[i].split(" ")
    word_count = len(words)
    for word in words:
        df_tf.at[i, word] += 1 / word_count

print("\nTerm Frequency (TF) DataFrame:")
print(df_tf)

# Initialize a DataFrame to store IDF values
df_idf = pd.DataFrame(np.zeros((1, n_words_set)), columns=words_list)

# Calculate IDF for each term
for word in words_list:
    doc_count = sum([1 for doc in documents if word in doc])
    df_idf.at[0, word] = math.log(n_docs / (1 + doc_count))

print("\nInverse Document Frequency (IDF) DataFrame:")
print(df_idf)

# Calculate TF-IDF scores
df_tfidf = df_tf * df_idf.values

print("\nTF-IDF Scores DataFrame:")
print(df_tfidf)


Number of words in the Corpus: 13
The words in the corpus:
 {'played', 'dog', 'and', 'friends.', 'yard.', 'on', 'are', 'sat', 'The', 'cat', 'mat.', 'in', 'the'}

Term Frequency (TF) DataFrame:
     played       dog       and  friends.     yard.        on       are  \
0  0.000000  0.000000  0.000000  0.000000  0.000000  0.166667  0.000000   
1  0.166667  0.166667  0.000000  0.000000  0.166667  0.000000  0.000000   
2  0.000000  0.142857  0.142857  0.142857  0.000000  0.000000  0.142857   

        sat       The       cat      mat.        in       the  
0  0.166667  0.166667  0.166667  0.166667  0.000000  0.166667  
1  0.000000  0.166667  0.000000  0.000000  0.166667  0.166667  
2  0.000000  0.142857  0.142857  0.000000  0.000000  0.142857  

Inverse Document Frequency (IDF) DataFrame:
     played  dog       and  friends.     yard.        on       are       sat  \
0  0.405465  0.0  0.405465  0.405465  0.405465  0.405465  0.405465  0.405465   

        The  cat      mat.        in       t

In [18]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

documents = [
    "The cat sat on the mat.",
    "The dog played in the yard.",
    "The cat and the dog are friends."
]

# Create a set of unique words in the corpus
words_set = set()
for doc in documents:
    words = doc.split(" ")
    words_set = words_set.union(set(words))

# Display the number of unique words and the words in the corpus
print("Number of words in the Corpus:", len(words_set))
print("The words in the corpus:\n", words_set)

n_docs = len(documents)
n_words_set = len(words_set)

# Convert the set of words into a list
words_list = list(words_set)

# Initialize a DataFrame to store Term Frequency (TF) values
df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), columns=words_list)

# Calculate TF for each term in each document
for i in range(n_docs):
    words = documents[i].split(" ")
    word_count = len(words)
    for word in words:
        df_tf.at[i, word] += 1 / word_count

# Display the Term Frequency (TF) DataFrame
print("\nTerm Frequency (TF) DataFrame:")
print(df_tf)

# Initialize a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit the vectorizer to the documents and transform them into TF-IDF vectors
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Convert TF-IDF vectors to an array for display
tfidf_array = tfidf_matrix.toarray()

# Display the TF-IDF array
print("\nTF-IDF Scores Array:")
print(tfidf_array)




Number of words in the Corpus: 13
The words in the corpus:
 {'played', 'dog', 'and', 'friends.', 'yard.', 'on', 'are', 'sat', 'The', 'cat', 'mat.', 'in', 'the'}

Term Frequency (TF) DataFrame:
     played       dog       and  friends.     yard.        on       are  \
0  0.000000  0.000000  0.000000  0.000000  0.000000  0.166667  0.000000   
1  0.166667  0.166667  0.000000  0.000000  0.166667  0.000000  0.000000   
2  0.000000  0.142857  0.142857  0.142857  0.000000  0.000000  0.142857   

        sat       The       cat      mat.        in       the  
0  0.166667  0.166667  0.166667  0.166667  0.000000  0.166667  
1  0.000000  0.166667  0.000000  0.000000  0.166667  0.166667  
2  0.000000  0.142857  0.142857  0.000000  0.000000  0.142857  

TF-IDF Scores Array:
[[0.         0.         0.34101521 0.         0.         0.
  0.44839402 0.44839402 0.         0.44839402 0.52965746 0.        ]
 [0.         0.         0.         0.34101521 0.         0.44839402
  0.         0.         0.44839

In [24]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

documents = [
    "The cat sat on the mat.",
    "The dog played in the yard.",
    "The cat and the dog are friends."
]

words_set = set()

# Create a set of unique words in the corpus
for doc in documents:
    words = doc.split(" ")
    words_set = words_set.union(set(words))

# Convert the set of words into a list
words_list = list(words_set)

print('Number of words in the corpus:', len(words_set))
print('The words in the corpus: \n', words_set)

n_docs = len(documents)         # Number of documents in the corpus
n_words_set = len(words_set)    # Number of unique words in the corpus

# Initialize DataFrame to store Term Frequency (TF) values
df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), columns=words_list)

# Compute Term Frequency (TF)
for i in range(n_docs):
    words = documents[i].split(' ')  # Words in the document
    for w in words:
        df_tf.at[i, w] += 1 / len(words)

print("\nTerm Frequency (TF) DataFrame:")
print(df_tf)

print("\nIDF of:")
idf = {}

# Compute Inverse Document Frequency (IDF)
for w in words_list:
    k = sum([1 for doc in documents if w in doc.split()])
    idf[w] = np.log10(n_docs / k)
    print(f'{w:>15}: {idf[w]:>10}')

# Initialize a TF-IDF DataFrame
df_tf_idf = df_tf.copy()

# # Compute TF-IDF scores manually
# for w in words_list:
#     for i in range(n_docs):
#         df_tf_idf.at[i, w] *= idf[w]

# print("\nTF-IDF DataFrame (Manually Calculated):")
# print(df_tf_idf)

# Use TfidfVectorizer to compute TF-IDF scores
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
tfidf_array = tfidf_matrix.toarray()

words_set_sklearn = tfidf_vectorizer.get_feature_names_out()

df_tf_idf_sklearn = pd.DataFrame(tfidf_array, columns=words_set_sklearn)

print("\nTF-IDF DataFrame (Computed using TfidfVectorizer):")
print(df_tf_idf_sklearn)


Number of words in the corpus: 13
The words in the corpus: 
 {'played', 'dog', 'and', 'friends.', 'yard.', 'on', 'are', 'sat', 'The', 'cat', 'mat.', 'in', 'the'}

Term Frequency (TF) DataFrame:
     played       dog       and  friends.     yard.        on       are  \
0  0.000000  0.000000  0.000000  0.000000  0.000000  0.166667  0.000000   
1  0.166667  0.166667  0.000000  0.000000  0.166667  0.000000  0.000000   
2  0.000000  0.142857  0.142857  0.142857  0.000000  0.000000  0.142857   

        sat       The       cat      mat.        in       the  
0  0.166667  0.166667  0.166667  0.166667  0.000000  0.166667  
1  0.000000  0.166667  0.000000  0.000000  0.166667  0.166667  
2  0.000000  0.142857  0.142857  0.000000  0.000000  0.142857  

IDF of:
         played: 0.47712125471966244
            dog: 0.17609125905568124
            and: 0.47712125471966244
       friends.: 0.47712125471966244
          yard.: 0.47712125471966244
             on: 0.47712125471966244
            are: 0

In [23]:
import pandas as pd
import numpy as np
import math

documents = [
    "The cat sat on the mat.",
    "The dog played in the yard.",
    "The cat and the dog are friends."
]

words_set = set()

# Create a set of unique words in the corpus
for doc in documents:
    words = doc.split(" ")
    words_set = words_set.union(set(words))

print("Number of words in the Corpus:", len(words_set))
print("The words in the corpus:\n", words_set)

n_docs = len(documents)
n_words_set = len(words_set)

# Convert the set of words into a list
words_list = list(words_set)

# Initialize a DataFrame to store TF values
df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), columns=words_list)

# Calculate TF for each term in each document
for i in range(n_docs):
    words = documents[i].split(" ")
    word_count = len(words)
    for word in words:
        df_tf.at[i, word] += 1 / word_count

print("\nTerm Frequency (TF) DataFrame:")
print(df_tf)

# Initialize a DataFrame to store IDF values
df_idf = pd.DataFrame(np.zeros((1, n_words_set)), columns=words_list)

# Calculate IDF for each term
for word in words_list:
    doc_count = sum([1 for doc in documents if word in doc])
    df_idf.at[0, word] = math.log(n_docs / (1 + doc_count))

print("\nInverse Document Frequency (IDF) DataFrame:")
print(df_idf)

# Calculate TF-IDF scores
df_tfidf = df_tf * df_idf.values

print("\nTF-IDF Scores DataFrame:")
print(df_tfidf)


Number of words in the Corpus: 13
The words in the corpus:
 {'played', 'dog', 'and', 'friends.', 'yard.', 'on', 'are', 'sat', 'The', 'cat', 'mat.', 'in', 'the'}

Term Frequency (TF) DataFrame:
     played       dog       and  friends.     yard.        on       are  \
0  0.000000  0.000000  0.000000  0.000000  0.000000  0.166667  0.000000   
1  0.166667  0.166667  0.000000  0.000000  0.166667  0.000000  0.000000   
2  0.000000  0.142857  0.142857  0.142857  0.000000  0.000000  0.142857   

        sat       The       cat      mat.        in       the  
0  0.166667  0.166667  0.166667  0.166667  0.000000  0.166667  
1  0.000000  0.166667  0.000000  0.000000  0.166667  0.166667  
2  0.000000  0.142857  0.142857  0.000000  0.000000  0.142857  

Inverse Document Frequency (IDF) DataFrame:
     played  dog       and  friends.     yard.        on       are       sat  \
0  0.405465  0.0  0.405465  0.405465  0.405465  0.405465  0.405465  0.405465   

        The  cat      mat.        in       t

In [26]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

documents = [
    "The cat sat on the mat.",
    "The dog played in the yard.",
    "The cat and the dog are friends."
]

words_set = set()

# Create a set of unique words in the corpus
for doc in documents:
    words = doc.split(" ")
    words_set = words_set.union(set(words))

# Convert the set of words into a list
words_list = list(words_set)

print('Number of words in the corpus:', len(words_set))
print('The words in the corpus: \n', words_set)

n_docs = len(documents)         # Number of documents in the corpus
n_words_set = len(words_set)    # Number of unique words in the corpus

# Initialize DataFrame to store Term Frequency (TF) values
df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), columns=words_list)

# Compute Term Frequency (TF)
for i in range(n_docs):
    words = documents[i].split(' ')  # Words in the document
    for w in words:
        df_tf.at[i, w] += 1 / len(words)

print("\nTerm Frequency (TF) DataFrame:")
print(df_tf)

print("\nIDF(Inverse Document Frequency) of:")
idf = {}

# Compute Inverse Document Frequency (IDF)
for w in words_list:
    k = sum([1 for doc in documents if w in doc.split()])
    idf[w] = np.log10(n_docs / k)
    print(f'{w:>15}: {idf[w]:>10}')

# Initialize a TF-IDF DataFrame
df_tf_idf = df_tf.copy()

# Use TfidfVectorizer to compute TF-IDF scores
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
tfidf_array = tfidf_matrix.toarray()

words_set_sklearn = tfidf_vectorizer.get_feature_names_out()

df_tf_idf_sklearn = pd.DataFrame(tfidf_array, columns=words_set_sklearn)

print("\nTF-IDF DataFrame (Computed using TfidfVectorizer):")
print(df_tf_idf_sklearn)


Number of words in the corpus: 13
The words in the corpus: 
 {'played', 'dog', 'and', 'friends.', 'yard.', 'on', 'are', 'sat', 'The', 'cat', 'mat.', 'in', 'the'}

Term Frequency (TF) DataFrame:
     played       dog       and  friends.     yard.        on       are  \
0  0.000000  0.000000  0.000000  0.000000  0.000000  0.166667  0.000000   
1  0.166667  0.166667  0.000000  0.000000  0.166667  0.000000  0.000000   
2  0.000000  0.142857  0.142857  0.142857  0.000000  0.000000  0.142857   

        sat       The       cat      mat.        in       the  
0  0.166667  0.166667  0.166667  0.166667  0.000000  0.166667  
1  0.000000  0.166667  0.000000  0.000000  0.166667  0.166667  
2  0.000000  0.142857  0.142857  0.000000  0.000000  0.142857  

IDF(Inverse Document Frequency) of:
         played: 0.47712125471966244
            dog: 0.17609125905568124
            and: 0.47712125471966244
       friends.: 0.47712125471966244
          yard.: 0.47712125471966244
             on: 0.47712125

In [1]:
import nltk
from nltk.tokenize import sent_tokenize

text = "Natural language processing (NLP) is a fascinating field. It deals with how computers understand and interact with human language. Sentence tokenization is one of the basic tasks in NLP."

sentence = sent_tokenize(text)

print(sentence)

['Natural language processing (NLP) is a fascinating field.', 'It deals with how computers understand and interact with human language.', 'Sentence tokenization is one of the basic tasks in NLP.']


In [2]:
import nltk
from nltk.tokenize import word_tokenize

text = "Natural language processing (NLP) is a fascinating field. It deals with how computers understand and interact with human language. Sentence tokenization is one of the basic tasks in NLP."

word = word_tokenize(text)

print(word)

['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'fascinating', 'field', '.', 'It', 'deals', 'with', 'how', 'computers', 'understand', 'and', 'interact', 'with', 'human', 'language', '.', 'Sentence', 'tokenization', 'is', 'one', 'of', 'the', 'basic', 'tasks', 'in', 'NLP', '.']


In [3]:
from nltk.corpus import stopwords

stopwords.fileids()

['arabic',
 'azerbaijani',
 'basque',
 'bengali',
 'catalan',
 'chinese',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'greek',
 'hebrew',
 'hinglish',
 'hungarian',
 'indonesian',
 'italian',
 'kazakh',
 'nepali',
 'norwegian',
 'portuguese',
 'romanian',
 'russian',
 'slovene',
 'spanish',
 'swedish',
 'tajik',
 'turkish']

In [4]:
from nltk.stem import PorterStemmer

words = ["eating","eats","eaten","writing","writes","programming","programs","history","running", "cats", "jumped", "faster", "quickly"]

stemming = PorterStemmer()

for word in words:
  print(word+" ----> "+stemming.stem(word))

eating ----> eat
eats ----> eat
eaten ----> eaten
writing ----> write
writes ----> write
programming ----> program
programs ----> program
history ----> histori
running ----> run
cats ----> cat
jumped ----> jump
faster ----> faster
quickly ----> quickli


In [5]:
from nltk.stem import SnowballStemmer

stammer = SnowballStemmer("english")

words = ["eating","eats","eaten","writing","writes","programming","programs","history","running", "cats", "jumped", "faster", "quickly"]

for word in words:
  print(word+" ---> "+stammer.stem(word))


eating ---> eat
eats ---> eat
eaten ---> eaten
writing ---> write
writes ---> write
programming ---> program
programs ---> program
history ---> histori
running ---> run
cats ---> cat
jumped ---> jump
faster ---> faster
quickly ---> quick


In [6]:
from nltk.stem import RegexpStemmer

words = ["eating","eats","eaten","writing","writes","programming","programs","history","running", "cats", "jumped", "faster", "quickly"]

# Define stemming rules using regular expressions
pattern = r"(ing$|s$|ed$|er$|est$|ly$)"

regexp_stemmer = RegexpStemmer(pattern)

for word in words:
  print(word+" ---> "+regexp_stemmer.stem(word))

eating ---> eat
eats ---> eat
eaten ---> eaten
writing ---> writ
writes ---> write
programming ---> programm
programs ---> program
history ---> history
running ---> runn
cats ---> cat
jumped ---> jump
faster ---> fast
quickly ---> quick


In [7]:
from nltk.stem import WordNetLemmatizer

# Create an instance of WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Examples of words categorized as verbs
verbs = ["running", "went", "eating"]

# Examples of words categorized as adjectives
adjectives = ["better", "worst", "faster"]

# Examples of words categorized as adverbs
adverbs = ["quickly", "slowly", "hardly"]

# Lemmatize verbs
print("Lemmatized Verbs:")
for verb in verbs:
    lemma = lemmatizer.lemmatize(verb, pos='v')  # 'v' indicates verb
    print(f"Original: {verb}\t Lemma: {lemma}")

# Lemmatize adjectives
print("\nLemmatized Adjectives:")
for adjective in adjectives:
    lemma = lemmatizer.lemmatize(adjective, pos='a')  # 'a' indicates adjective
    print(f"Original: {adjective}\t Lemma: {lemma}")

# Lemmatize adverbs
print("\nLemmatized Adverbs:")
for adverb in adverbs:
    lemma = lemmatizer.lemmatize(adverb, pos='r')  # 'r' indicates adverb
    print(f"Original: {adverb}\t Lemma: {lemma}")

Lemmatized Verbs:
Original: running	 Lemma: run
Original: went	 Lemma: go
Original: eating	 Lemma: eat

Lemmatized Adjectives:
Original: better	 Lemma: good
Original: worst	 Lemma: bad
Original: faster	 Lemma: fast

Lemmatized Adverbs:
Original: quickly	 Lemma: quickly
Original: slowly	 Lemma: slowly
Original: hardly	 Lemma: hardly


In [2]:
import csv

class WordReplacer:
  def __init__(self,word_map):
    self.word_map = word_map

  def replace(self,word):
    return self.word_map.get(word,word)
  
replacer = WordReplacer({"bday" : "Birthday"})
replacer.replace("bday")

class CsvWordReplacer(WordReplacer):
    def __init__(self, fname):
        word_map = {}
        with open(fname, 'r') as file:
            reader = csv.DictReader(file)
            for row in reader:
                word = row['Word']
                synonyms = row['Synonyms']
                word_map[word] = synonyms
        super().__init__(word_map)

replacer = CsvWordReplacer("Synonyms.csv")

text = replacer.replace("bf")
text1 = replacer.replace("bday")
text2 = replacer.replace("pls")

print(text)
print(text1)
print(text2)


boyfriend
birthday
please
