Connected to base (Python 3.11.5)

In [1]:
# program		vandenburg_demo_tsne_irisData_v1.py
# purpose	    Demonstrate perceptron with iris data
# usage         script
# notes         (1)
# date			2/14/2024
# programmer   Colton Vandenburg

import datetime          # used for getting the date
import os                # used for getting the basic file name (returns lower case)
from sklearn.manifold import TSNE as tsne   # t-distributed stochastic neighbor embedding
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import preprocessing    # Import label encoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
import seaborn as sns

# ============== COMMON INITIALIZATION =====================
date_o = datetime.datetime.today()
date_c = date_o.strftime('%m/%d/%Y')


script_dir = os.path.dirname(os.path.abspath(__file__))     # Get the directory of the current script
csv_file_path, programName_c = os.path.join(script_dir, 'irisData_modified.csv')   # Construct the full path to the CSV file

irisData_df = pd.read_csv(csv_file_path)    # Load the CSV file

ix = str.find(programName_c,'.')

fileName_c = 'irisData_modified.csv'
programMsg_c = programName_c + ' (' + date_c + ')'


authorName_c = 'Colton Vandenburg'
figName_c = programName_c[:ix]+'_fig.png'


# ========== get data frame, split, get numerical data ==============
irisData_df = pd.read_csv((fileName_c))
irisData_featureNames_cv = irisData_df.columns
irisData_featureNames_cv = irisData_featureNames_cv[:4]     # remove type column

tts_ts = 0.3
tts_rs = 24     #26 performs poorly - why?
irisDataT_df, irisDataV_df = train_test_split(irisData_df,test_size = tts_ts, random_state=tts_rs) 
irisDataT_num_df = irisDataT_df.drop("type", axis = 1)    # numerical data only

# ============== get labels, preprocess train data except for scaling  ======================
irisDataT_labels = irisDataT_df["type"]
label_encoder = preprocessing.LabelEncoder()  
irisDataT_labels_v = label_encoder.fit_transform(irisDataT_labels)

imputer = SimpleImputer(strategy = "median")
imputer.fit(irisDataT_num_df)
irisDataT_num_df = imputer.transform(irisDataT_num_df)

# ================= get labels, preprocess verify data ==================
irisDataV_num_df = irisDataV_df.drop("type", axis = 1)    # numerical data only
irisDataV_labels = irisDataV_df["type"]
irisDataV_labels_v = label_encoder.fit_transform(irisDataV_labels) 

irisDataV_num_df = imputer.transform(irisDataV_num_df)

# ================ perceptron, non-scaled ===================
ppn_eta = .01
ppn_rs = 1
ppn_T = Perceptron(eta0 = ppn_eta, random_state = ppn_rs)    # what is best value for eta?
ppn_T.fit(irisDataT_num_df, irisDataT_labels_v)

predict_T = ppn_T.predict(irisDataT_num_df)         # try on test data (should be better than verify)
accuracy_T = accuracy_score(irisDataT_labels_v, predict_T)
predict_V = ppn_T.predict(irisDataV_num_df)         # now do on verify data
accuracy_V = accuracy_score(irisDataV_labels_v, predict_V)

print('accuracy_T = ',accuracy_T)
print('accuracy_V = ',accuracy_V)

# ================ perceptron, standardized ===================
sc = StandardScaler()       # 0-mean, sigma=1; performs better than minMaxScaler(), which does [0,1]
sc.fit(irisDataT_num_df)    # do the fit step on the training data
irisDataTs_num_df = sc.transform(irisDataT_num_df)  # apply transform on train data
irisDataVs_num_df = sc.transform(irisDataV_num_df)  # do same transform on verify data

ppn_T.fit(irisDataTs_num_df, irisDataT_labels_v)

predict_Ts = ppn_T.predict(irisDataTs_num_df)         # try on test data (should be better than verify)
accuracy_Ts = accuracy_score(irisDataT_labels_v, predict_Ts)
predict_Vs = ppn_T.predict(irisDataVs_num_df)         # now do on verify data
accuracy_Vs = accuracy_score(irisDataV_labels_v, predict_Vs)

print('accuracy_Ts = ',accuracy_Ts)
print('accuracy_Vs = ',accuracy_Vs)

# ================= perceptron, normalized ==================
scn = MinMaxScaler()        # does not do as well as StandardScaler
scn.fit(irisDataT_num_df)    # do the fit step on the training data
irisDataTn_num_df = scn.transform(irisDataT_num_df)  # apply transform on train data
irisDataVn_num_df = scn.transform(irisDataV_num_df)  # do same transform on verify data

ppn_T.fit(irisDataTn_num_df, irisDataT_labels_v)

predict_Tn = ppn_T.predict(irisDataTn_num_df)         # try on test data (should be better than verify)
accuracy_Tn = accuracy_score(irisDataT_labels_v, predict_Tn)
predict_Vn = ppn_T.predict(irisDataVn_num_df)         # now do on verify data
accuracy_Vn = accuracy_score(irisDataV_labels_v, predict_Vn)

print('accuracy_Tn = ',accuracy_Tn)
print('accuracy_Vn = ',accuracy_Vn)

# ===================== prepare to plot results ===================
msg_tts_c = "test_train_split: tts_ts = " + "%1.2f" %(tts_ts) + "; tts_rs = " + str(tts_rs)
msg_ppn_c = "perceptron: ppn_eta = " + "%1.2f" %(ppn_eta) + "; ppn_rs = " + str(ppn_rs)
msg_plot_c = msg_tts_c + '; ' + msg_ppn_c

msg_acc1_c = "Accuracy: T = " + "%1.2f" %(accuracy_T) + "; V = " + "%1.2f" %(accuracy_V)
msg_acc2_c = "Ts = " + "%1.2f" %(accuracy_Ts) + "; Vs = " + "%1.2f" %(accuracy_Vs)
msg_acc3_c = "Tn = " + "%1.2f" %(accuracy_Tn) + "; Vs = " + "%1.2f" %(accuracy_Vn)

msg_acc_c = msg_acc1_c + '; ' + msg_acc2_c + '; ' + msg_acc3_c

# ======================== PLOT RESULTS ======================
#plt.figure(num=1, figsize=(11.2, 5.2))        # not consistent with boxplot
plt.rcParams.update({'font.size': 9})           # 8-point fits a little better but still overlaps
fig, axes = plt.subplots(2, 2, figsize=(11.2, 5.2))
fig.suptitle(msg_acc_c,fontsize=9)
sns.set(style='whitegrid')

sns.set(font_scale = 0.7)   # make fonts a little smaller on the box plot
for kp, feature in enumerate(irisData_featureNames_cv):
    sns.boxplot(x='type', y=feature, hue='type', data=irisData_df, ax=axes[kp//2, kp%2], palette='pastel')


# ============= Make the subplots look a little nicer ================= 
plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.15, wspace=0.2, hspace=0.3)

# ================= label plot edges ==================
plt.subplot(position=[0.0500,    0.93,    0.02500,    0.02500]) # U-left
plt.axis('off')
plt.text(0,.5, programMsg_c, fontsize=8)

plt.subplot(position=[0.550,    0.93,    0.02500,    0.02500]) # U-right
plt.axis('off')
plt.text(0,.5, authorName_c, fontsize=8)

plt.subplot(position=[0.0500,    0.02,    0.02500,    0.02500]) # L-left
plt.axis('off')
plt.text(0,.5, fileName_c, fontsize=8)

plt.subplot(position=[0.3500,    0.02,    0.02500,    0.02500]) # L-right
plt.axis('off')
plt.text(0,.5, msg_plot_c, fontsize=8)


plt.savefig(figName_c)

plt.show()

ValueError: too many values to unpack (expected 2)

In [2]:
# program		data_preprocess.py
# purpose	    Proprocess and standardize for training data
# usage         write_dialogue_to_file('filename')
#               read_text_file('filename')
# notes         (1) 
# date			2/7/2024
# programmer    Colton Vandenburg
import json
import string
import datetime
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import json
import datetime
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder


def write_dialogue_to_file(filename):
    filename = f"{filename}_{datetime.date.today()}.txt"
    with open(filename, 'w') as file:
        while True:
            myWords = input("Enter dialogue for Me (or type 'exit' to quit): ")
            if myWords.lower() == 'exit':
                break
            file.write(f"Speaker 1: {myWords}\n")
            
            friend_words = input("Enter dialogue for Friend (or type 'exit' to quit): ")
            if friend_words.lower() == 'exit':
                if myWords.lower() == 'exit':
                    return
                break
            file.write(f"Speaker 2: {friend_words}\n")


#write_dialogue_to_file('conversation')



def read_text_file(filename):
    with open(filename, 'r', encoding='utf-8', errors='ignore') as file:
        content = file.readlines()
    content = [line.encode('ascii', 'ignore').decode('ascii') for line in content]
    return content


filename = 'conversation.txt'
content = read_text_file(filename)

def preprocess_conversations(conversations):
    processed_conversations = []
    label_encoder = LabelEncoder()

    for conversation in conversations:
        processed_conversation = []
        for utterance in conversation:
            # Tokenization, Lowercasing, Remove Punctuation, Remove Empty Tokens
            tokens = [token.lower().translate(str.maketrans('', '', string.punctuation)) for token in utterance.split() if token]
            processed_conversation.append(' '.join(tokens))
        processed_conversations.append(processed_conversation)

    # Vectorize the labels using LabelEncoder
    encoded_labels = label_encoder.fit_transform(range(len(processed_conversations)))

    return processed_conversations, encoded_labels
processed_content = preprocess_conversations(content)

from sklearn.feature_extraction.text import CountVectorizer

# Create an instance of CountVectorizer
vectorizer = CountVectorizer()

# Fit the vectorizer on your preprocessed content
vectorizer.fit(processed_content)

# Transform the preprocessed content into vectors
vectorized_content = vectorizer.transform(processed_content)

# Now you can use the vectorized content for your machine learning model

FileNotFoundError: [Errno 2] No such file or directory: 'conversation.txt'

In [3]:
# program		data_preprocess.py
# purpose	    Proprocess and standardize for training data
# usage         write_dialogue_to_file('filename')
#               read_text_file('filename')
# notes         (1) 
# date			2/7/2024
# programmer    Colton Vandenburg
import json
import string
import datetime
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import json
import datetime
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder


def write_dialogue_to_file(filename):
    filename = f"{filename}_{datetime.date.today()}.txt"
    with open(filename, 'w') as file:
        while True:
            myWords = input("Enter dialogue for Me (or type 'exit' to quit): ")
            if myWords.lower() == 'exit':
                break
            file.write(f"Speaker 1: {myWords}\n")
            
            friend_words = input("Enter dialogue for Friend (or type 'exit' to quit): ")
            if friend_words.lower() == 'exit':
                if myWords.lower() == 'exit':
                    return
                break
            file.write(f"Speaker 2: {friend_words}\n")


#write_dialogue_to_file('conversation')



def read_text_file(filename):
    with open(filename, 'r', encoding='utf-8', errors='ignore') as file:
        content = file.readlines()
    content = [line.encode('ascii', 'ignore').decode('ascii') for line in content]
    return content


filename = 'nkl341/EE497_ML/Project/conversation.txt'
content = read_text_file(filename)

def preprocess_conversations(conversations):
    processed_conversations = []
    label_encoder = LabelEncoder()

    for conversation in conversations:
        processed_conversation = []
        for utterance in conversation:
            # Tokenization, Lowercasing, Remove Punctuation, Remove Empty Tokens
            tokens = [token.lower().translate(str.maketrans('', '', string.punctuation)) for token in utterance.split() if token]
            processed_conversation.append(' '.join(tokens))
        processed_conversations.append(processed_conversation)

    # Vectorize the labels using LabelEncoder
    encoded_labels = label_encoder.fit_transform(range(len(processed_conversations)))

    return processed_conversations, encoded_labels
processed_content = preprocess_conversations(content)

from sklearn.feature_extraction.text import CountVectorizer

# Create an instance of CountVectorizer
vectorizer = CountVectorizer()

# Fit the vectorizer on your preprocessed content
vectorizer.fit(processed_content)

# Transform the preprocessed content into vectors
vectorized_content = vectorizer.transform(processed_content)

# Now you can use the vectorized content for your machine learning model

FileNotFoundError: [Errno 2] No such file or directory: 'nkl341/EE497_ML/Project/conversation.txt'

In [4]:
content = read_text_file('Project/conversation.txt')

FileNotFoundError: [Errno 2] No such file or directory: 'Project/conversation.txt'