<a href="https://colab.research.google.com/github/maxenlee/DataScienceToolBox/blob/main/NLTKProcessor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project 5 NLTK

In [1]:
!pip install scikit-learn==1.4

Collecting scikit-learn==1.4
  Downloading scikit_learn-1.4.0-1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
Successfully installed scikit-learn-1.4.0


In [2]:
import pandas as pd
import os
import pickle
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from textblob import TextBlob, Word


from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np
import random
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Importing File

In [3]:
# Assuming 'url' is the path to your CSV file
csv_url = 'https://ddc-datascience.s3.amazonaws.com/Projects/Project.5-NLP/Data/NLP.csv'
df = pd.read_csv(csv_url)


In [4]:
df.shape

(42786, 3)

In [5]:
from urllib.parse import unquote
df['name'] = df['name'].apply(lambda x: unquote(x))

### Transforming the text in the text column


*   Tokenizing
*   removing punction with REGEX  
*   Removing stop words
*   lemmatizing
*   improved speed with memoization
*   caching lemmatization



In [8]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

# Ensure that NLTK resources are downloaded (do this once)
# nltk.download('averaged_perceptron_tagger')
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

# Initialize the lemmatizer and stop words list
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Cache for memoization
lemmatization_cache = {}

def lemmatize_with_cache(word):
    # If the word has already been lemmatized, return the result from the cache
    if word in lemmatization_cache:
        return lemmatization_cache[word]
    # Otherwise, lemmatize the word and store the result in the cache
    else:
        lemmatized_word = lemmatizer.lemmatize(word)
        lemmatization_cache[word] = lemmatized_word
        return lemmatized_word

def get_proper_nouns(text):
    """Extract proper nouns from a given text."""
    proper_nouns = []
    tagged_tokens = pos_tag(word_tokenize(text))
    for word, tag in tagged_tokens:
        if tag == 'NNP' or tag == 'NNPS':  # Proper nouns
            proper_nouns.append(word.lower())
    return set(proper_nouns)

def process_text(df,on,optional_dict = {}):
    processed_texts = []

    for index, row in df.iterrows():
        # Extract proper nouns from the 'name' column to exclude from the 'text'
        proper_nouns = get_proper_nouns(row[optional_dict])

        # Tokenize and remove punctuation
        tokens = word_tokenize(row[on])
        tokens = [re.sub(r'\W+', '', token) for token in tokens if re.sub(r'\W+', '', token)]

        # Remove stop words, proper nouns, and lemmatize
        tokens = [lemmatize_with_cache(token.lower()) for token in tokens if token.lower() not in stop_words and token.lower() not in proper_nouns]

        # Rejoin tokens into a single string
        processed_text = ' '.join(tokens)
        processed_texts.append(processed_text)

    # Return a new DataFrame with processed texts
    return pd.DataFrame({'text': processed_texts})


In [9]:
# Assuming 'df' is your DataFrame with a 'text' column
processed_df = process_text(df,on = 'text', optional_dict= 'name')


# TF-IDF


In [10]:
processed_df.head()

Unnamed: 0,text
0,born 10 october 1979 former australian rule fo...
1,j aka sandy graduated university chicago 1973 ...
2,singer harmonica player active canada blue sce...
3,born waidmannsfeld lower austria austria 18 ja...
4,henry krvits born 30 december 1974 tallinn bet...


In [11]:
df['text'] = processed_df['text']

In [12]:
df['text']

0        born 10 october 1979 former australian rule fo...
1        j aka sandy graduated university chicago 1973 ...
2        singer harmonica player active canada blue sce...
3        born waidmannsfeld lower austria austria 18 ja...
4        henry krvits born 30 december 1974 tallinn bet...
                               ...                        
42781    born july 8 1967 saitama prefecture japanese c...
42782    graham born 14 may 1960 retired professional f...
42783    lozano born 4 september 1959 cali colombian fo...
42784    faberg author faberg scholar switzerlanda grea...
42785    born february 24 1938 chief financial officer ...
Name: text, Length: 42786, dtype: object

In [13]:
og_size = sum(df['text'].apply(lambda x: len(x)))
og_size

59493851

In [14]:
processed_size = sum(processed_df['text'].apply(lambda x: len(x)))
processed_size

59493851

In [15]:
og_size - processed_size

0

In [16]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tf_idf_matrix = tfidf_vectorizer.fit_transform(processed_df['text'])

### Fitting K Nearest neighbors

In [17]:
nearest_neighbors = NearestNeighbors().fit(tf_idf_matrix)

In [18]:
np.random.seed(24)
page = random.randrange(0,len(df))
wiki = tf_idf_matrix[page]


In [19]:
page

36059

In [20]:
wiki

<1x411040 sparse matrix of type '<class 'numpy.float64'>'
	with 143 stored elements in Compressed Sparse Row format>

## nearest neighbors in DB Pedia

In [21]:
distances, indices = nearest_neighbors.kneighbors(X = wiki , n_neighbors=11 )

# Retrieve the original texts for the nearest neighbors
# nearest_texts = original_texts.iloc[indices.flatten()]



In [22]:
neighbors_index = list(indices[0][:])

In [23]:
neighbors_distance = list(distances[0][:])

In [24]:
zipped = list(zip(neighbors_index,neighbors_distance))

In [25]:
zipped

[(36059, 0.0),
 (38372, 1.2464090352778006),
 (3637, 1.2568380728598036),
 (9463, 1.271322587553191),
 (14875, 1.2737221791597961),
 (11864, 1.2737662518328685),
 (28990, 1.2797301532698064),
 (3247, 1.2805364955967193),
 (10498, 1.2821199200474527),
 (19543, 1.2822398791847311),
 (6136, 1.284437727953191)]

In [26]:
db_name = df.iloc[page]['name']

In [27]:
df_DBneighbors = df.iloc[neighbors_index]


In [28]:
DB_nn = pd.Series(distances[0][:],name = 'distances' , index= indices[0][:])
# df_Wiki_Neighbors = pd.concat([df_Wiki_Neighbors,wiki_nn], axis = 0)
df_DBneighbors = df_DBneighbors.join(DB_nn,)

In [29]:
df_DBneighbors

Unnamed: 0,URI,name,text,distances
36059,<http://dbpedia.org/resource/Kenneth_Colley>,Kenneth Colley,born 7 december 1937 english actor longtime ch...,0.0
38372,<http://dbpedia.org/resource/Liam_Neeson>,Liam Neeson,john obe born 7 june 1952 irish actor 1976 joi...,1.246409
3637,<http://dbpedia.org/resource/John_E._Hudgens>,John E. Hudgens,e born april 6 1967 memphis tennessee independ...,1.256838
9463,<http://dbpedia.org/resource/Clint_Eastwood>,Clint Eastwood,clinton jr born may 31 1930 american actor dir...,1.271323
14875,<http://dbpedia.org/resource/John_Hurt>,John Hurt,sir vincent cbe born 22 january 1940 english a...,1.273722
11864,<http://dbpedia.org/resource/Dominic_Mafham>,Dominic Mafham,english actor born 11 march 1968 trained brist...,1.273766
28990,<http://dbpedia.org/resource/Jeremy_Irvine>,Jeremy Irvine,william fredric smith born july 1990 known pro...,1.27973
3247,<http://dbpedia.org/resource/James_Fisher_(act...,James Fisher (actor),b 20 april 1972 walthamstow london actor young...,1.280536
10498,<http://dbpedia.org/resource/David_Morrissey>,David Morrissey,mark born 21 june 1964 english actor director ...,1.28212
19543,<http://dbpedia.org/resource/J._J._Cohen>,J. J. Cohen,jeffrey jay born june 22 1965 actor appeared f...,1.28224


### Sentiment scores for each neighbor

In [30]:
df_DBneighbors.loc[:, 'sentiment'] = df_DBneighbors['text'].apply(lambda x: TextBlob(x).sentiment)

In [31]:
df_dict={}
df_dict['DB'] = df_DBneighbors

In [32]:
df_dict['DB']

Unnamed: 0,URI,name,text,distances,sentiment
36059,<http://dbpedia.org/resource/Kenneth_Colley>,Kenneth Colley,born 7 december 1937 english actor longtime ch...,0.0,"(-0.02725563909774437, 0.44761904761904764)"
38372,<http://dbpedia.org/resource/Liam_Neeson>,Liam Neeson,john obe born 7 june 1952 irish actor 1976 joi...,1.246409,"(0.4321428571428571, 0.3964285714285714)"
3637,<http://dbpedia.org/resource/John_E._Hudgens>,John E. Hudgens,e born april 6 1967 memphis tennessee independ...,1.256838,"(0.1340909090909091, 0.3640151515151515)"
9463,<http://dbpedia.org/resource/Clint_Eastwood>,Clint Eastwood,clinton jr born may 31 1930 american actor dir...,1.271323,"(0.18403263403263398, 0.2663403263403263)"
14875,<http://dbpedia.org/resource/John_Hurt>,John Hurt,sir vincent cbe born 22 january 1940 english a...,1.273722,"(0.11570165945165946, 0.47469336219336217)"
11864,<http://dbpedia.org/resource/Dominic_Mafham>,Dominic Mafham,english actor born 11 march 1968 trained brist...,1.273766,"(0.15535511363636365, 0.3047774621212121)"
28990,<http://dbpedia.org/resource/Jeremy_Irvine>,Jeremy Irvine,william fredric smith born july 1990 known pro...,1.27973,"(0.15641025641025644, 0.4314102564102565)"
3247,<http://dbpedia.org/resource/James_Fisher_(act...,James Fisher (actor),b 20 april 1972 walthamstow london actor young...,1.280536,"(-0.04038857967429396, 0.4226654298082869)"
10498,<http://dbpedia.org/resource/David_Morrissey>,David Morrissey,mark born 21 june 1964 english actor director ...,1.28212,"(0.06746031746031747, 0.21904761904761902)"
19543,<http://dbpedia.org/resource/J._J._Cohen>,J. J. Cohen,jeffrey jay born june 22 1965 actor appeared f...,1.28224,"(0.07142857142857142, 0.22862811791383222)"


# Part 2: The SQL
Of the 10 Nearest Neighbors, Find their Whole Wiki Page and Order Their Indecies by the Whole Page.

In [33]:
%%capture
!pip3 install wikipedia-api

In [34]:
import wikipediaapi
wikipedia = wikipediaapi.Wikipedia(user_agent = 'abooboo')

In [35]:
wiki

<1x411040 sparse matrix of type '<class 'numpy.float64'>'
	with 143 stored elements in Compressed Sparse Row format>

In [36]:
Person = db_name
person_text = wikipedia.page(Person)
person_text.title
person_text.text

'Kenneth Colley (born 7 December 1937) is an English film and television actor whose career spans over 60 years. He came to wider prominence through his role as Admiral Piett in the Star Wars films The Empire Strikes Back (1980) and Return of the Jedi (1983).\n\nCareer\nColley was born in Manchester, Lancashire. One of his early appearances on British television was as Noah Riley in the 1970s police drama The Sweeney, in an episode entitled Trap. He played Jesus in The Life of Brian, having also appeared in the earlier Monty Python-related production Ripping Yarns episode "The Testing of Eric Olthwaite" alongside Michael Palin. As a Shakespearean actor he played the Duke of Vienna in the BBC Television Shakespeare production of Measure for Measure in 1979.Colley worked extensively with British director Ken Russell from the early 1970s to the early 1990s as part of a repertory of actors who appeared across Russell\'s television and film work. He played the role of Modest Tchaikovsky in 

In [37]:
df_Wiki_Neighbors = pd.DataFrame()
df_Wiki_Neighbors['name'] = df_DBneighbors['name'].apply(lambda x: wikipedia.page(x).title)
df_Wiki_Neighbors['text'] = df_Wiki_Neighbors['name'].apply(lambda x: wikipedia.page(x).text )
df_Wiki_Neighbors['sentiment'] = df_Wiki_Neighbors['text'].apply(lambda x: TextBlob(x).sentiment)

In [38]:
df_Wiki_Neighbors = df_Wiki_Neighbors.reset_index(drop=True)

In [39]:

processed_wiki_df = process_text(df_Wiki_Neighbors,on = 'text', optional_dict= 'name')


In [40]:
processed_wiki_df = processed_wiki_df.reset_index(drop = True)

In [41]:
processed_wiki_df

Unnamed: 0,text
0,born 7 december 1937 english film television a...
1,william john born 7 june 1952 actor northern i...
2,e born april 6 1967 memphis tennessee independ...
3,clinton jr born may 31 1930 american actor fil...
4,sir vincent 22 january 1940 25 january 2017 en...
5,born 11 march 1968 english stage film televisi...
6,william fredric smith born 18 june 1990 known ...
7,b 20 april 1972 walthamstow london actor produ...
8,mark joseph born 21 june 1964 english actor fi...
9,jeffrey jay born june 22 1965 american actor a...


In [42]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tf_idf_matrix = tfidf_vectorizer.fit_transform(processed_wiki_df['text'])
nearest_neighbors = NearestNeighbors().fit(tf_idf_matrix)

In [43]:
page = 0
wiki = tf_idf_matrix[page]
distances, indices = nearest_neighbors.kneighbors(X = wiki , n_neighbors=11 )

In [44]:
distances[0]

array([0.        , 1.17964999, 1.21454257, 1.21681827, 1.23130463,
       1.23237269, 1.24386084, 1.26514631, 1.26723744, 1.29061357,
       1.33887193])

In [45]:
wiki_nn = pd.Series(distances[0][:],name = 'distances' , index= indices[0][:])
# df_Wiki_Neighbors = pd.concat([df_Wiki_Neighbors,wiki_nn], axis = 0)
df_Wiki_Neighbors = df_Wiki_Neighbors.join(wiki_nn,)

In [46]:
df_Wiki_Neighbors

Unnamed: 0,name,text,sentiment,distances
0,Kenneth Colley,Kenneth Colley (born 7 December 1937) is an En...,"(0.11800595238095239, 0.31860119047619045)",0.0
1,Liam Neeson,William John Neeson (born 7 June 1952) is an ...,"(0.10432826694619159, 0.3513676749525802)",1.232373
2,John E. Hudgens,"John E. Hudgens (born April 6, 1967, in Memphi...","(0.14959770114942525, 0.4005747126436781)",1.265146
3,Clint Eastwood,"Clinton Eastwood Jr. (born May 31, 1930) is an...","(0.1160429156164843, 0.3943730944070845)",1.216818
4,John Hurt,Sir John Vincent Hurt (22 January 1940 – 25 J...,"(0.135275965937401, 0.3132502649318791)",1.214543
5,Dominic Mafham,Dominic Mafham (born 11 March 1968) is an Engl...,"(0.15620202020202023, 0.2928236208236209)",1.243861
6,Jeremy Irvine,Jeremy William Fredric Smith (born 18 June 199...,"(0.11019444444444448, 0.34251388888888884)",1.267237
7,James Fisher (actor),"James Fisher (b. 20 April 1972 Walthamstow, Lo...","(0.12268170426065166, 0.30369674185463663)",1.338872
8,David Morrissey,David Mark Joseph Morrissey (born 21 June 1964...,"(0.08424209628755085, 0.3288752084855978)",1.17965
9,J. J. Cohen,"Jeffrey Jay Cohen (born June 22, 1965) is an A...","(0.054411764705882354, 0.2419467787114846)",1.290614


# Part3 Interactivity


In [None]:
# Necessary imports
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from textblob import TextBlob
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.lines as mlines

# Assuming TextProcessor is a custom class you've defined elsewhere
# from your_custom_module import TextProcessor

def get_nearest_neighbors(df, page, k=10):
    """
    Returns the nearest neighbors from DB Pedia and Wiki for a given page.

    Args:
      df: The DataFrame containing the processed text.
      page: The index of the page to find nearest neighbors for.
      k: The number of nearest neighbors to return.

    Returns:
      A dictionary containing DataFrames of nearest neighbors from DB Pedia and Wiki.
    """

    # Ensure the necessary libraries are imported above


    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tf_idf_matrix = tfidf_vectorizer.fit_transform(df['text'])
    nearest_neighbors = NearestNeighbors().fit(tf_idf_matrix)
    wiki = tf_idf_matrix[page]
    db_distances, db_indices = nearest_neighbors.kneighbors(X=wiki, n_neighbors=k+1)

    # DB DataFrame
    df_DBneighbors = df.iloc[db_indices[0]]
    df_DBneighbors = df_DBneighbors.drop(columns=['URI'], errors='ignore')
    df_DBneighbors['sentiment'] = df_DBneighbors['text'].apply(lambda x: TextBlob(x).sentiment)
    df_DBneighbors['distances'] = db_distances[0]


    # Wiki DataFrame
    df_DBneighbors['wiki_name'] = df_DBneighbors['name'].apply(lambda x: wikipedia.page(x).title)
    df_DBneighbors['wiki_text'] = df_DBneighbors['wiki_name'].apply(lambda x: wikipedia.page(x).text)  # Use summary for brevity
    df_DBneighbors['wiki_sentiment'] = df_DBneighbors['wiki_text'].apply(lambda x: TextBlob(x).sentiment)
    wiki_text = process_text(df_DBneighbors,on = 'wiki_text', optional_dict= 'wiki_name')
    # Ensure the necessary libraries are imported above

    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tf_idf_matrix = tfidf_vectorizer.fit_transform(wiki_text['text'])
    nearest_neighbors = NearestNeighbors().fit(tf_idf_matrix)
    wiki = tf_idf_matrix[0]
    db_distances, db_indices = nearest_neighbors.kneighbors(X=wiki, n_neighbors=k+1)
    df_DBneighbors['wiki_distances'] = db_distances[0]

    # df_DBneighbors

    df_DBneighbors['pol1'] = df_DBneighbors['sentiment'].apply(lambda x: x[0])
    df_DBneighbors['sub1'] = df_DBneighbors['sentiment'].apply(lambda x: x[1])

    df_DBneighbors['pol2'] = df_DBneighbors['wiki_sentiment'].apply(lambda x: x[0])
    df_DBneighbors['sub2'] = df_DBneighbors['wiki_sentiment'].apply(lambda x: x[1])

    output = df_DBneighbors


    fig, ax = plt.subplots(figsize=(12, 6))  # Increase figure width

    # Define a list of markers to cycle through
    markers = ['o', 'v', '^', '<', '>', 's', 'p', '*', '+', 'x','D']

    # Placeholder for custom legend icons
    legend_icons = []

    mark_i = 0
    # Loop through the DataFrame and plot each point with a different marker
    for i, row in output.iterrows():
        scatter1 = ax.scatter(row['pol1'], row['sub1'], color='blue', label='DB' if i == 0 else "", marker=markers[mark_i])
        scatter2 = ax.scatter(row['pol2'], row['sub2'], color='red', label='wiki' if i == 0 else "", marker=markers[mark_i])
        # Create a custom legend icon for each name
        legend_icon = mlines.Line2D([], [], color='black', marker=markers[mark_i], linestyle='None', markersize=10, label=row['name'])
        legend_icons.append(legend_icon)
        mark_i += 1

    # Set labels and title
    ax.set_xlabel('Polarity')
    ax.set_ylabel('Subjectivity')
    ax.set_title('Sentiment Analysis: Polarity vs. Subjectivity')

    # Create the first legend manually and specify its position outside the plot area
    first_legend = ax.legend(handles=legend_icons, loc='upper left', bbox_to_anchor=(1, 1), title='Names')
    # Add the first legend manually


    second_legend = ax.legend([scatter1, scatter2], ['DB', 'wiki'], loc='lower left', bbox_to_anchor=(1, 0))

    # Create and add the second legend for the scatter plot colors/labels, also outside
    plt.legend(handles=legend_icons, loc='upper left', bbox_to_anchor=(1, 1), title='Names')
    ax.add_artist(second_legend)
    plt.subplots_adjust(right=.75)  # Adjust subplot to create more space on the right

    plt.show()

    return df_DBneighbors


# Example usage (assuming 'df' and 'condition' are defined in your context)
search = 'Franz Rottensteiner'
condition = df[df['name'] == search].index[0]
output = get_nearest_neighbors(df, page=condition)


In [None]:
output

# USER INPUT

Exact match

In [None]:
def name_search():
  search = input("Enter a name from the DataFrame: ")
  print('loading...')
  condition = df[df['name'] == search].index[0]
  output = get_nearest_neighbors(df, page=condition)
  print('DING!')

In [None]:
my_neighbors = name_search()

In [None]:
import numpy as np
import pandas as pd
import textdistance


def get_closest_names(df, name, k=5):
    """
    Returns the k closest names to the given name in the DataFrame using Levenshtein distance.

    Args:
        df: The DataFrame containing the names.
        name: The name to find the closest names to.
        k: The number of closest names to return.

    Returns:
        A list of the k closest names.
    """

    # Calculate the Levenshtein distance between the given name and all other names
    distances = df['name'].apply(lambda x: textdistance.levenshtein(name.lower(), x.lower()))

    # Sort the distances in ascending order and get the indices of the k closest names
    sorted_indices = np.argsort(distances)[:k]

    # Return the names corresponding to the closest indices
    return df['name'].iloc[sorted_indices].tolist()



In [None]:
# Example usagew
search = input("Enter a name from the DataFrame: ")
closest_names = get_closest_names(df, search)
print("The 5 closest names to", search, "are:", closest_names)


In [None]:
import pandas as pd
import numpy as np
import textdistance

# Sample DataFrame
# df = pd.DataFrame({'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve']})

def get_closest_names(df, name, k=5):
    """
    Returns the k closest names to the given name in the DataFrame using Levenshtein distance.

    Args:
        df: The DataFrame containing the names.
        name: The name to find the closest names to.
        k: The number of closest names to return.

    Returns:
        A list of the k closest names.
    """
    distances = df['name'].apply(lambda x: textdistance.levenshtein(name.lower(), x.lower()))
    sorted_indices = np.argsort(distances)[:k]
    return df['name'].iloc[sorted_indices].tolist()

def execute_function_with_name(search):
    """
    Placeholder for executing a function with the selected name.

    Args:
        name: The selected name to use in the function.
    """
    print('loading...')
    condition = df[df['name'] == search].index[0]
    output = get_nearest_neighbors(df, page=condition)
    print('DING!')
    # Placeholder functionality
    print(f"Function executed with {search}.")


In [None]:
# Interactive process
search = input("Enter a name from the DataFrame: ")
closest_names = get_closest_names(df, search)
print("The closest names to", search, "are:")
for index, name in enumerate(closest_names, start=1):
    print(f"{index}. {name}")

selected_index = int(input("Select a name by entering the corresponding number: ")) - 1
if 0 <= selected_index < len(closest_names):
    selected_name = closest_names[selected_index]
    print(f"You selected: {selected_name}")
    execute_function_with_name(selected_name)
else:
    print("Invalid selection. Please run the process again and select a valid number.")


In [None]:
m= ['o', 'v', '^', '<', '>', 's', 'p', '*', '+', 'x','^']

In [None]:
counter = 0
for i,k in output.iterrows():
  print(m[counter])
  counter +=1

In [None]:
m[0]